From 4baf0dbe742100e5ffe63f6fd19f92cb280f818c Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Thu, 14 Jul 2022 18:51:22 +0800 Subject: [PATCH] Compilation optimization (#44242) * Compilation optimization --- .../eager_generated/backwards/CMakeLists.txt | 4 +- .../eager_generated/forwards/CMakeLists.txt | 4 +- .../eager_manual/forwards/CMakeLists.txt | 17 +- .../manual/eager_manual/nodes/CMakeLists.txt | 13 +- .../fluid_manual/forwards/CMakeLists.txt | 29 +- .../manual/fluid_manual/nodes/CMakeLists.txt | 19 +- .../auto_code_generator/eager_generator.cc | 45 +- .../generate_file_structures.py | 17 +- .../framework/new_executor/CMakeLists.txt | 81 +- paddle/fluid/imperative/CMakeLists.txt | 19 +- paddle/fluid/operators/CMakeLists.txt | 2 +- .../platform/device/gpu/cuda/CMakeLists.txt | 2 +- paddle/fluid/pybind/.gitignore | 9 +- paddle/fluid/pybind/CMakeLists.txt | 122 +- .../fluid/pybind/generate_file_structures.py | 8 +- paddle/fluid/pybind/imperative.cc | 10 +- paddle/fluid/pybind/op_function.h | 9 +- paddle/fluid/pybind/op_function_generator.cc | 111 +- paddle/fluid/pybind/parallel_executor.cc | 1118 ++++++++ paddle/fluid/pybind/parallel_executor.h | 25 + paddle/fluid/pybind/place.cc | 816 ++++++ paddle/fluid/pybind/place.h | 25 + paddle/fluid/pybind/pybind.cc | 2447 +---------------- paddle/fluid/pybind/tensor.cc | 1106 ++++++++ paddle/fluid/pybind/tensor.h | 25 + 25 files changed, 3420 insertions(+), 2663 deletions(-) create mode 100644 paddle/fluid/pybind/parallel_executor.cc create mode 100644 paddle/fluid/pybind/parallel_executor.h create mode 100644 paddle/fluid/pybind/place.cc create mode 100644 paddle/fluid/pybind/place.h create mode 100644 paddle/fluid/pybind/tensor.cc create mode 100644 paddle/fluid/pybind/tensor.h diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt b/paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt index fbd552ef00..1f2b30853c 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt +++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt @@ -6,7 +6,7 @@ cc_library( if(NOT (NOT WITH_PYTHON AND ON_INFER)) cc_library( final_dygraph_node - SRCS nodes.cc - DEPS ${eager_deps} ${eager_manual_nodes}) + SRCS nodes.cc ${eager_manual_nodes} + DEPS ${eager_deps}) add_dependencies(final_dygraph_node eager_final_state_codegen) endif() diff --git a/paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt b/paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt index 66053baa58..9baf8956fe 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt +++ b/paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt @@ -6,7 +6,7 @@ cc_library( if(NOT (NOT WITH_PYTHON AND ON_INFER)) cc_library( final_dygraph_function - SRCS dygraph_functions.cc - DEPS ${eager_deps} ${eager_manual_functions}) + SRCS dygraph_functions.cc ${eager_manual_functions} + DEPS ${eager_deps}) add_dependencies(final_dygraph_function eager_final_state_codegen) endif() diff --git a/paddle/fluid/eager/api/manual/eager_manual/forwards/CMakeLists.txt b/paddle/fluid/eager/api/manual/eager_manual/forwards/CMakeLists.txt index d71f1153e2..d25b3ba08b 100644 --- a/paddle/fluid/eager/api/manual/eager_manual/forwards/CMakeLists.txt +++ b/paddle/fluid/eager/api/manual/eager_manual/forwards/CMakeLists.txt @@ -1,17 +1,4 @@ -cc_library( - add_n_fwd_func - SRCS add_n_fwd_func.cc - DEPS ${eager_deps} ${fluid_deps} ${GLOB_OP_LIB} 
${GLOB_OPERATOR_DEPS}) - -add_dependencies(add_n_fwd_func eager_codegen) - -cc_library( - conv2d_fwd_function - SRCS conv2d_fwd_function.cc - DEPS ${eager_deps} ${fluid_deps} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS}) - -add_dependencies(conv2d_fwd_function eager_codegen) - set(eager_manual_functions - conv2d_fwd_function add_n_fwd_func + ${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/eager_manual/forwards/add_n_fwd_func.cc + ${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc PARENT_SCOPE) diff --git a/paddle/fluid/eager/api/manual/eager_manual/nodes/CMakeLists.txt b/paddle/fluid/eager/api/manual/eager_manual/nodes/CMakeLists.txt index fa6a9a53ab..ac5ce176f4 100644 --- a/paddle/fluid/eager/api/manual/eager_manual/nodes/CMakeLists.txt +++ b/paddle/fluid/eager/api/manual/eager_manual/nodes/CMakeLists.txt @@ -1,13 +1,4 @@ -cc_library( - add_n_node - SRCS add_n_node.cc - DEPS ${eager_deps} ${fluid_deps}) - -cc_library( - conv2d_nodes - SRCS conv2d_nodes.cc - DEPS ${eager_deps} ${fluid_deps}) - set(eager_manual_nodes - conv2d_nodes add_n_node + ${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/eager_manual/nodes/conv2d_nodes.cc + ${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/eager_manual/nodes/add_n_node.cc PARENT_SCOPE) diff --git a/paddle/fluid/eager/api/manual/fluid_manual/forwards/CMakeLists.txt b/paddle/fluid/eager/api/manual/fluid_manual/forwards/CMakeLists.txt index 295b8d9a64..5c47b0870a 100644 --- a/paddle/fluid/eager/api/manual/fluid_manual/forwards/CMakeLists.txt +++ b/paddle/fluid/eager/api/manual/fluid_manual/forwards/CMakeLists.txt @@ -1,28 +1,5 @@ -cc_library( - fused_gate_attention_fwd_func - SRCS fused_gate_attention_fwd_func.cc - DEPS ${eager_deps} ${fluid_deps} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS}) - -add_dependencies(fused_gate_attention_fwd_func eager_codegen - copy_dygraph_forward_functions) - -cc_library( - fused_feedforward_fwd_func - SRCS fused_feedforward_fwd_func.cc - DEPS ${eager_deps} ${fluid_deps} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS}) - -add_dependencies(fused_feedforward_fwd_func eager_codegen - copy_dygraph_forward_functions) - -cc_library( - fused_attention_fwd_func - SRCS fused_attention_fwd_func.cc - DEPS ${eager_deps} ${fluid_deps} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS}) - -add_dependencies(fused_attention_fwd_func eager_codegen - copy_dygraph_forward_functions) - set(fluid_manual_functions - fused_gate_attention_fwd_func fused_feedforward_fwd_func - fused_attention_fwd_func + ${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_gate_attention_fwd_func.cc + ${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_feedforward_fwd_func.cc + ${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_attention_fwd_func.cc PARENT_SCOPE) diff --git a/paddle/fluid/eager/api/manual/fluid_manual/nodes/CMakeLists.txt b/paddle/fluid/eager/api/manual/fluid_manual/nodes/CMakeLists.txt index 28c034e8b5..101ed5d589 100644 --- a/paddle/fluid/eager/api/manual/fluid_manual/nodes/CMakeLists.txt +++ b/paddle/fluid/eager/api/manual/fluid_manual/nodes/CMakeLists.txt @@ -1,18 +1,5 @@ -cc_library( - fused_gate_attention_node - SRCS fused_gate_attention_node.cc - DEPS ${eager_deps} ${fluid_deps}) - -cc_library( - fused_feedforward_node - SRCS fused_feedforward_node.cc - DEPS ${eager_deps} ${fluid_deps}) - -cc_library( - fused_attention_node - SRCS fused_attention_node.cc - DEPS ${eager_deps} ${fluid_deps}) - set(fluid_manual_nodes - fused_gate_attention_node 
fused_feedforward_node fused_attention_node + ${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/fluid_manual/nodes/fused_gate_attention_node.cc + ${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/fluid_manual/nodes/fused_feedforward_node.cc + ${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/fluid_manual/nodes/fused_attention_node.cc PARENT_SCOPE) diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index 4f5efe74fa..54b40c72d0 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -3083,27 +3083,44 @@ static std::string ConvertCoreOpsInfosToString( return core_ops_returns_info_init_str; } -static std::string GenerateCoreOpsReturnsInfo() { +static std::string GenerateCoreOpsArgsInfo() { const char* Core_Ops_Returns_MAP_TEMPLATE = "std::unordered_map> " - "core_ops_args_info = { %s };\n" - "std::unordered_map> " - "core_ops_args_type_info = { %s };\n" - "std::unordered_map> " - "core_ops_returns_info = { %s };\n"; + "core_ops_args_info = { %s };\n"; std::string core_ops_args_info_init_str = ConvertCoreOpsInfosToString(core_ops_args_info); + + std::string core_ops_info_str = paddle::string::Sprintf( + Core_Ops_Returns_MAP_TEMPLATE, core_ops_args_info_init_str); + + return core_ops_info_str; +} + +static std::string GenerateCoreOpsArgsTypeInfo() { + const char* Core_Ops_Returns_MAP_TEMPLATE = + "std::unordered_map> " + "core_ops_args_type_info = { %s };\n"; + std::string core_ops_args_type_info_init_str = ConvertCoreOpsInfosToString(core_ops_args_type_info); + + std::string core_ops_info_str = paddle::string::Sprintf( + Core_Ops_Returns_MAP_TEMPLATE, core_ops_args_type_info_init_str); + + return core_ops_info_str; +} + +static std::string GenerateCoreOpsReturnsInfo() { + const char* Core_Ops_Returns_MAP_TEMPLATE = + "std::unordered_map> " + "core_ops_returns_info = { %s };\n"; + std::string core_ops_returns_info_init_str = ConvertCoreOpsInfosToString(core_ops_returns_info); - std::string core_ops_info_str = - paddle::string::Sprintf(Core_Ops_Returns_MAP_TEMPLATE, - core_ops_args_info_init_str, - core_ops_args_type_info_init_str, - core_ops_returns_info_init_str); + std::string core_ops_info_str = paddle::string::Sprintf( + Core_Ops_Returns_MAP_TEMPLATE, core_ops_returns_info_init_str); return core_ops_info_str; } @@ -3252,6 +3269,12 @@ static void DygraphCodeGeneration(const std::string& output_dir, GenerateForwardDygraphFile( output_dir + "/forwards/dygraph_forward_functions_args_info.tmp.cc", + GenerateCoreOpsArgsInfo()); + GenerateForwardDygraphFile( + output_dir + "/forwards/dygraph_forward_functions_args_type_info.tmp.cc", + GenerateCoreOpsArgsTypeInfo()); + GenerateForwardDygraphFile( + output_dir + "/forwards/dygraph_forward_functions_returns_info.tmp.cc", GenerateCoreOpsReturnsInfo()); VLOG(6) << "-------- GenerateNodeCCFile -------"; diff --git a/paddle/fluid/eager/auto_code_generator/generate_file_structures.py b/paddle/fluid/eager/auto_code_generator/generate_file_structures.py index d6574bc2e8..9fbf1ed6cd 100644 --- a/paddle/fluid/eager/auto_code_generator/generate_file_structures.py +++ b/paddle/fluid/eager/auto_code_generator/generate_file_structures.py @@ -96,6 +96,11 @@ def GenerateFileStructureForIntermediateDygraph(eager_dir, split_count): "nodes" + str(i + 1) + ".cc")) empty_files.append( os.path.join(forwards_dir, "dygraph_forward_functions_args_info.cc")) + empty_files.append( + os.path.join(forwards_dir, + 
"dygraph_forward_functions_args_type_info.cc")) + empty_files.append( + os.path.join(forwards_dir, "dygraph_forward_functions_returns_info.cc")) for path in empty_files: if not os.path.exists(path): open(path, 'a').close() @@ -125,7 +130,7 @@ def GenerateFileStructureForIntermediateDygraph(eager_dir, split_count): f.write("cc_library(dygraph_node SRCS ") for i in range(split_count): f.write("nodes" + str(i + 1) + ".cc ") - f.write("DEPS ${eager_deps} ${fluid_deps} ${fluid_manual_nodes})\n") + f.write("${fluid_manual_nodes} DEPS ${eager_deps} ${fluid_deps})\n") f.write("add_dependencies(dygraph_node copy_dygraph_node)") with open(forwards_level_cmakelist_path, "w") as f: @@ -143,6 +148,12 @@ def GenerateFileStructureForIntermediateDygraph(eager_dir, split_count): f.write( " COMMAND ${CMAKE_COMMAND} -E copy_if_different \"${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/forwards/dygraph_forward_functions_args_info.tmp.cc\" \"${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/forwards/dygraph_forward_functions_args_info.cc\"\n" ) + f.write( + " COMMAND ${CMAKE_COMMAND} -E copy_if_different \"${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/forwards/dygraph_forward_functions_args_type_info.tmp.cc\" \"${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/forwards/dygraph_forward_functions_args_type_info.cc\"\n" + ) + f.write( + " COMMAND ${CMAKE_COMMAND} -E copy_if_different \"${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/forwards/dygraph_forward_functions_returns_info.tmp.cc\" \"${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/forwards/dygraph_forward_functions_returns_info.cc\"\n" + ) f.write(" DEPENDS eager_codegen\n") f.write(" VERBATIM)\n") @@ -150,8 +161,10 @@ def GenerateFileStructureForIntermediateDygraph(eager_dir, split_count): for i in range(split_count): f.write("dygraph_forward_functions" + str(i + 1) + ".cc ") f.write("dygraph_forward_functions_args_info.cc ") + f.write("dygraph_forward_functions_args_type_info.cc ") + f.write("dygraph_forward_functions_returns_info.cc ") f.write( - "DEPS ${eager_deps} ${fluid_deps} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} ${fluid_manual_functions})\n" + "${fluid_manual_functions} DEPS ${eager_deps} ${fluid_deps} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})\n" ) f.write( "add_dependencies(dygraph_function copy_dygraph_forward_functions)") diff --git a/paddle/fluid/framework/new_executor/CMakeLists.txt b/paddle/fluid/framework/new_executor/CMakeLists.txt index cf10734d1d..006e98f175 100644 --- a/paddle/fluid/framework/new_executor/CMakeLists.txt +++ b/paddle/fluid/framework/new_executor/CMakeLists.txt @@ -1,4 +1,16 @@ -set(INTERPRETERCORE_DEPS +add_subdirectory(workqueue) +add_subdirectory(garbage_collector) + +set(STANDALONE_EXECUTOR_SRCS + data_transfer.cc + new_executor_defs.cc + interpretercore_util.cc + event_manager.cc + stream_analyzer.cc + interpretercore.cc + standalone_executor.cc) + +set(STANDALONE_EXECUTOR_DEPS op_registry device_context scope @@ -20,62 +32,33 @@ set(INTERPRETERCORE_DEPS variable_helper timer monitor - nan_inf_utils) - -add_subdirectory(workqueue) -add_subdirectory(garbage_collector) - -cc_library( - data_transfer - SRCS data_transfer.cc - DEPS enforce scope glog) -cc_library( - new_executor_defs - SRCS new_executor_defs.cc - DEPS enforce glog scope) -cc_library( - interpretercore_util - SRCS interpretercore_util.cc - DEPS ${INTERPRETERCORE_DEPS} workqueue new_executor_defs data_transfer) -cc_library( - 
event_manager - SRCS event_manager.cc - DEPS ${DEVICE_EVENT_LIBS} glog new_executor_defs) -cc_library( - stream_analyzer - SRCS stream_analyzer.cc - DEPS ${DEVICE_EVENT_LIBS} glog device_context new_executor_defs) + nan_inf_utils + enforce + scope + glog + enforce + glog + scope + workqueue + interpretercore_event_garbage_collector + ${DEVICE_EVENT_LIBS} + glog) if(WITH_GPU OR WITH_ROCM) - cc_library( - interpretercore - SRCS interpretercore.cc - DEPS workqueue - ${DEVICE_EVENT_LIBS} - interpretercore_util - interpretercore_event_garbage_collector - interpretercore_fast_garbage_collector - stream_analyzer - event_manager) -else() - cc_library( - interpretercore - SRCS interpretercore.cc - DEPS workqueue ${DEVICE_EVENT_LIBS} interpretercore_util - interpretercore_event_garbage_collector stream_analyzer event_manager) + set(STANDALONE_EXECUTOR_DEPS ${STANDALONE_EXECUTOR_DEPS} + interpretercore_fast_garbage_collector) endif() cc_library( standalone_executor - SRCS standalone_executor.cc - DEPS interpretercore) + SRCS ${STANDALONE_EXECUTOR_SRCS} + DEPS ${STANDALONE_EXECUTOR_DEPS}) cc_library( staticgraph_executor_statistics SRCS executor_statistics.cc DEPS enforce glog os_info) -# cc_binary(standalone_executor_test SRCS standalone_executor_test.cc DEPS interpretercore standalone_executor operator op_registry executor ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} profiler) # skip win32 since wget is not installed by default on windows machine. if(WITH_GPU AND WITH_TESTING @@ -120,13 +103,7 @@ if(WITH_GPU cc_test( standalone_executor_test SRCS standalone_executor_test.cc - DEPS interpretercore - standalone_executor - operator - op_registry - executor - ${OPS} - ${OP_DEPS}) + DEPS standalone_executor operator op_registry executor ${OPS} ${OP_DEPS}) set_tests_properties(standalone_executor_test PROPERTIES TIMEOUT 100) add_dependencies(standalone_executor_test download_program) diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index 2d4a57b82a..98ece2db96 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -5,7 +5,7 @@ cc_library( cc_library( var_helper SRCS var_helper.cc - DEPS tensor phi_api) + DEPS tensor selected_rows) if(WITH_XPU) cc_library( prepared_operator @@ -20,8 +20,8 @@ if(WITH_XPU) op_kernel_type data_transform nan_inf_utils - phi_api - phi_utils + scalar + int_array var_helper profiler) else() @@ -37,21 +37,16 @@ else() op_kernel_type data_transform nan_inf_utils - phi_api - phi_utils + scalar + int_array var_helper profiler) endif() cc_library( layer SRCS layer.cc - DEPS prepared_operator - math_function - imperative_flag - variable_helper - op_registry - var_helper - phi_api) + DEPS prepared_operator math_function imperative_flag variable_helper + op_registry var_helper) add_subdirectory(jit) if(WITH_GPU) cc_library( diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 7fb00504ee..809ad5174b 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -101,7 +101,7 @@ else() cc_library(gather_scatter_kernel SRCS gather_scatter_kernel.cc gather_scatter_kernel.cu DEPS tensor) endif() -set(OP_HEADER_DEPS ${OP_HEADER_DEPS} phi phi_api_utils gather_scatter_kernel) +set(OP_HEADER_DEPS ${OP_HEADER_DEPS} phi phi_api_utils gather_scatter_kernel backward_infermeta) register_operators(EXCLUDES py_layer_op py_func_op warpctc_op dgc_op load_combine_op lstm_op run_program_op eye_op quantize_linear_op recurrent_op save_combine_op 
sparse_attention_op sync_batch_norm_op spectral_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS}) diff --git a/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt b/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt index da9121550e..15c7a6c462 100644 --- a/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt +++ b/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt @@ -10,4 +10,4 @@ nv_library( nv_test( cudnn_helper_test SRCS cudnn_helper_test.cc - DEPS dynload_cuda phi) + DEPS dynload_cuda) diff --git a/paddle/fluid/pybind/.gitignore b/paddle/fluid/pybind/.gitignore index bd45f1ec2e..a6f20e2180 100644 --- a/paddle/fluid/pybind/.gitignore +++ b/paddle/fluid/pybind/.gitignore @@ -1,4 +1,11 @@ pybind.h -op_function.cc +op_function1.cc +op_function2.cc +op_function3.cc +op_function4.cc +op_function5.cc +op_function6.cc +op_function7.cc +op_function8.cc eager_op_function.cc eager_final_state_op_function.cc diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index d5c7bcc30d..f301189d77 100755 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -102,13 +102,16 @@ endif() set(PYBIND_SRCS pybind.cc imperative.cc - op_function.cc inference_api.cc ir.cc bind_fleet_executor.cc reader_py.cc protobuf.cc exception.cc + op_function_common.cc + parallel_executor.cc + tensor.cc + place.cc const_value.cc global_value_getter_setter.cc fleet_wrapper_py.cc @@ -124,13 +127,15 @@ set(PYBIND_SRCS generator_py.cc communication.cc cuda_streams_py.cc - jit.cc) - -execute_process( - COMMAND - "${PYTHON_EXECUTABLE}" - "${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/generate_file_structures.py" - "${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/") + jit.cc + op_function1.cc + op_function2.cc + op_function3.cc + op_function4.cc + op_function5.cc + op_function6.cc + op_function7.cc + op_function8.cc) if(WITH_CUSTOM_DEVICE) set(PYBIND_DEPS ${PYBIND_DEPS} phi_capi) @@ -267,12 +272,35 @@ if(WITH_PYTHON) target_link_libraries(kernel_signature_generator ${ROCM_HIPRTC_LIB}) endif() - set(impl_file ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/op_function.cc) - set(tmp_impl_file ${impl_file}.tmp) + set(op_function_output_path ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/) + set(impl_file1 ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/op_function1.cc) + set(tmp_impl_file1 ${impl_file1}.tmp) + set(impl_file2 ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/op_function2.cc) + set(tmp_impl_file2 ${impl_file2}.tmp) + set(impl_file3 ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/op_function3.cc) + set(tmp_impl_file3 ${impl_file3}.tmp) + set(impl_file4 ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/op_function4.cc) + set(tmp_impl_file4 ${impl_file4}.tmp) + set(impl_file5 ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/op_function5.cc) + set(tmp_impl_file5 ${impl_file5}.tmp) + set(impl_file6 ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/op_function6.cc) + set(tmp_impl_file6 ${impl_file6}.tmp) + set(impl_file7 ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/op_function7.cc) + set(tmp_impl_file7 ${impl_file7}.tmp) + set(impl_file8 ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/op_function8.cc) + set(tmp_impl_file8 ${impl_file8}.tmp) + set(CODE_GEN_SPLIT_FILE_COUNT "8") set(eager_impl_file ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/eager_op_function.cc) set(tmp_eager_impl_file ${eager_impl_file}.tmp) + execute_process( + COMMAND + "${PYTHON_EXECUTABLE}" + "${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/generate_file_structures.py" + "${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/" + "${CODE_GEN_SPLIT_FILE_COUNT}") + set(OP_IMPL_DEPS op_function_generator) 
set(EAGER_OP_IMPL_DEPS eager_op_function_generator eager_final_state_python_c_codegen) @@ -292,7 +320,7 @@ if(WITH_PYTHON) ":retry\n" "ECHO op_function_generator run %build_times% time\n" "taskkill /f /im op_function_generator.exe 2>NUL\n" - "${op_impl_path}/op_function_generator.exe ${tmp_impl_file}\n" + "${op_impl_path}/op_function_generator.exe ${op_function_output_path} ${CODE_GEN_SPLIT_FILE_COUNT}\n" "if %ERRORLEVEL% NEQ 0 (\n" " set /a build_times=%build_times%+1\n" " if %build_times% GEQ 10 (\n" @@ -367,12 +395,33 @@ if(WITH_PYTHON) endif() add_custom_command( - OUTPUT ${impl_file} + OUTPUT op_function COMMAND ${CMAKE_BINARY_DIR}/paddle/fluid/pybind/op_function_generator_retry.bat - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file} - ${impl_file} - COMMENT "copy_if_different ${tmp_impl_file} to ${impl_file}" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file1} + ${impl_file1} + COMMENT "copy_if_different ${tmp_impl_file1} to ${impl_file1}" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file2} + ${impl_file2} + COMMENT "copy_if_different ${tmp_impl_file2} to ${impl_file2}" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file3} + ${impl_file3} + COMMENT "copy_if_different ${tmp_impl_file3} to ${impl_file3}" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file4} + ${impl_file4} + COMMENT "copy_if_different ${tmp_impl_file4} to ${impl_file4}" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file5} + ${impl_file5} + COMMENT "copy_if_different ${tmp_impl_file5} to ${impl_file5}" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file6} + ${impl_file6} + COMMENT "copy_if_different ${tmp_impl_file6} to ${impl_file6}" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file7} + ${impl_file7} + COMMENT "copy_if_different ${tmp_impl_file7} to ${impl_file7}" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file8} + ${impl_file8} + COMMENT "copy_if_different ${tmp_impl_file8} to ${impl_file8}" DEPENDS ${OP_IMPL_DEPS}) if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) add_custom_command( @@ -431,13 +480,35 @@ if(WITH_PYTHON) list(APPEND EAGER_OP_IMPL_DEPS ${CMAKE_CURRENT_BINARY_DIR}/libdnnl.so.0) endif() add_custom_command( - OUTPUT ${impl_file} + OUTPUT op_function COMMAND ${CMAKE_COMMAND} -E env "LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:." 
- "${CMAKE_CURRENT_BINARY_DIR}/op_function_generator" "${tmp_impl_file}" - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file} - ${impl_file} - COMMENT "copy_if_different ${tmp_impl_file} to ${impl_file}" + "${CMAKE_CURRENT_BINARY_DIR}/op_function_generator" + "${op_function_output_path}" "${CODE_GEN_SPLIT_FILE_COUNT}" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file1} + ${impl_file1} + COMMENT "copy_if_different ${tmp_impl_file1} to ${impl_file1}" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file2} + ${impl_file2} + COMMENT "copy_if_different ${tmp_impl_file2} to ${impl_file2}" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file3} + ${impl_file3} + COMMENT "copy_if_different ${tmp_impl_file3} to ${impl_file3}" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file4} + ${impl_file4} + COMMENT "copy_if_different ${tmp_impl_file4} to ${impl_file4}" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file5} + ${impl_file5} + COMMENT "copy_if_different ${tmp_impl_file5} to ${impl_file5}" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file6} + ${impl_file6} + COMMENT "copy_if_different ${tmp_impl_file6} to ${impl_file6}" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file7} + ${impl_file7} + COMMENT "copy_if_different ${tmp_impl_file7} to ${impl_file7}" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file8} + ${impl_file8} + COMMENT "copy_if_different ${tmp_impl_file8} to ${impl_file8}" DEPENDS ${OP_IMPL_DEPS} VERBATIM) if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) @@ -454,19 +525,13 @@ if(WITH_PYTHON) VERBATIM) endif() endif() - add_custom_target(op_function_generator_cmd ALL DEPENDS ${impl_file}) + add_custom_target(op_function_generator_cmd ALL DEPENDS op_function) if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) add_custom_target(eager_op_function_generator_cmd ALL DEPENDS ${eager_impl_file}) endif() - list(APPEND PYBIND_DEPS interpretercore standalone_executor - staticgraph_executor_statistics) - cc_library( - op_function_common - SRCS op_function_common.cc - DEPS ${PYBIND_DEPS}) - list(APPEND PYBIND_DEPS op_function_common) + list(APPEND PYBIND_DEPS standalone_executor staticgraph_executor_statistics) if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) set(PYBIND_SRCS eager.cc ${PYBIND_SRCS}) @@ -482,7 +547,6 @@ if(WITH_PYTHON) list(APPEND PYBIND_DEPS backward) list(APPEND PYBIND_DEPS grad_node_info) list(APPEND PYBIND_DEPS phi) - list(APPEND PYBIND_DEPS op_function_common) list(APPEND PYBIND_DEPS final_dygraph_function) list(APPEND PYBIND_DEPS final_dygraph_node) list(APPEND PYBIND_DEPS dygraph_function) diff --git a/paddle/fluid/pybind/generate_file_structures.py b/paddle/fluid/pybind/generate_file_structures.py index 391c47b8ee..bc61ecdcc9 100644 --- a/paddle/fluid/pybind/generate_file_structures.py +++ b/paddle/fluid/pybind/generate_file_structures.py @@ -16,12 +16,16 @@ import sys import os if __name__ == "__main__": - assert len(sys.argv) == 2 + assert len(sys.argv) == 3 pybind_dir = sys.argv[1] + split_count = int(sys.argv[2]) empty_files = [os.path.join(pybind_dir, "eager_final_state_op_function.cc")] empty_files.append(os.path.join(pybind_dir, "eager_op_function.cc")) - empty_files.append(os.path.join(pybind_dir, "op_function.cc")) + + for i in range(split_count): + empty_files.append( + os.path.join(pybind_dir, "op_function" + str(i + 1) + ".cc")) for path in empty_files: if not os.path.exists(path): diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 
569890fa25..8a21271db4 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -64,6 +64,7 @@ limitations under the License. */ namespace paddle { namespace pybind { +std::atomic VarBaseUniqueNameID{0}; PyTypeObject *g_varbase_pytype = nullptr; namespace py = ::pybind11; @@ -497,7 +498,14 @@ static void VarBaseCopy(std::shared_ptr &src, // NOLINT void BindImperative(py::module *m_ptr) { auto &m = *m_ptr; - BindOpFunctions(&m); + BindOpFunctions1(&m); + BindOpFunctions2(&m); + BindOpFunctions3(&m); + BindOpFunctions4(&m); + BindOpFunctions5(&m); + BindOpFunctions6(&m); + BindOpFunctions7(&m); + BindOpFunctions8(&m); #ifndef _WIN32 // Dygraph DataLoader signal handler diff --git a/paddle/fluid/pybind/op_function.h b/paddle/fluid/pybind/op_function.h index 5038dd5e6c..884136ec0d 100644 --- a/paddle/fluid/pybind/op_function.h +++ b/paddle/fluid/pybind/op_function.h @@ -257,7 +257,14 @@ PyObject* MakeReturnPyObject(const std::tuple& out) { return result; } -void BindOpFunctions(pybind11::module* module); +void BindOpFunctions1(pybind11::module* module); +void BindOpFunctions2(pybind11::module* module); +void BindOpFunctions3(pybind11::module* module); +void BindOpFunctions4(pybind11::module* module); +void BindOpFunctions5(pybind11::module* module); +void BindOpFunctions6(pybind11::module* module); +void BindOpFunctions7(pybind11::module* module); +void BindOpFunctions8(pybind11::module* module); } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc index 9ddf0e7083..7eeadac7ce 100644 --- a/paddle/fluid/pybind/op_function_generator.cc +++ b/paddle/fluid/pybind/op_function_generator.cc @@ -422,13 +422,17 @@ std::string GenerateOpFunctionsBody( return op_function_str; } -static std::tuple, std::vector> -GenerateOpFunctions() { +static std::vector< + std::tuple, std::vector>> +GenerateOpFunctions(int split_count) { auto& op_info_map = paddle::framework::OpInfoMap::Instance().map(); - + std::vector, std::vector>> + result; std::vector op_function_list, bind_function_list; auto& all_kernels = paddle::framework::OperatorWithKernel::AllOpKernels(); + paddle::flat_hash_map + op_info_map_need_gen; for (auto& pair : op_info_map) { auto& op_info = pair.second; auto op_proto = op_info.proto_; @@ -444,6 +448,22 @@ GenerateOpFunctions() { continue; } + op_info_map_need_gen.emplace(pair); + } + + int cc_file_api_size = op_info_map_need_gen.size() / split_count; + if (op_info_map_need_gen.size() % split_count != 0) { + cc_file_api_size++; + } + int api_index = 0; + int file_index = 0; + + for (auto& pair : op_info_map_need_gen) { + auto& op_info = pair.second; + auto op_proto = op_info.proto_; + + auto& op_type = op_proto->type(); + // NOTE(pangyoki): Inplace Strategy. // In this case, output will reuse input varbase. 
// Dygraph mode needs to be aligned with the in-place strategy in static @@ -489,13 +509,24 @@ GenerateOpFunctions() { op_function_list.emplace_back(std::move(inplace_op_function_str)); bind_function_list.emplace_back(std::move(inplace_bind_function_str)); } + + api_index++; + if (api_index / cc_file_api_size > file_index) { + file_index++; + result.push_back(std::make_tuple(op_function_list, bind_function_list)); + op_function_list.clear(); + bind_function_list.clear(); + } } - return std::make_tuple(op_function_list, bind_function_list); + + result.push_back(std::make_tuple(op_function_list, bind_function_list)); + + return result; } int main(int argc, char* argv[]) { - if (argc != 2) { - std::cerr << "argc must be 2" << std::endl; + if (argc != 3) { + std::cerr << "argc must be 3" << std::endl; return -1; } @@ -513,39 +544,45 @@ int main(int argc, char* argv[]) { "\"paddle/fluid/pybind/op_function.h\"", ""}; - std::ofstream out(argv[1], std::ios::out); + std::string path = argv[1]; + int split_count = atoi(argv[2]); - for (auto& header : headers) { - out << "#include " + header + "\n"; - } + auto op_funcs = GenerateOpFunctions(split_count); - out << "\n\n"; - - auto op_funcs = GenerateOpFunctions(); - - out << "namespace paddle {\n" - << "namespace pybind {\n\n"; - out << "std::atomic VarBaseUniqueNameID{0};\n"; - out << paddle::string::join_strings(std::get<0>(op_funcs), '\n'); - out << "\n\n"; - - out << "static PyMethodDef ExtestMethods[] = {\n" - << paddle::string::join_strings(std::get<1>(op_funcs), '\n') - << "\n {nullptr,nullptr,0,nullptr}" - << "};\n\n"; - - out << "void BindOpFunctions(pybind11::module *module) {\n" - << " auto m = module->def_submodule(\"ops\");\n" - << " if (PyModule_AddFunctions(m.ptr(), ExtestMethods) < 0) {\n" - << " PADDLE_THROW(platform::errors::Fatal (\"Add functions to " - "core.ops failed!\"));\n" - << " }\n\n" - << " InitOpsAttrTypeMap();" - << "}\n\n" - << "} // namespace pybind\n" - << "} // namespace paddle\n"; - - out.close(); + for (size_t i = 0; i < op_funcs.size(); i++) { + std::ofstream out(path + "op_function" + std::to_string(i + 1) + ".cc.tmp", + std::ios::out); + + for (auto& header : headers) { + out << "#include " + header + "\n"; + } + + out << "\n\n"; + + out << "namespace paddle {\n" + << "namespace pybind {\n\n"; + out << "extern std::atomic VarBaseUniqueNameID;\n"; + out << paddle::string::join_strings(std::get<0>(op_funcs[i]), '\n'); + out << "\n\n"; + + out << "static PyMethodDef ExtestMethods[] = {\n" + << paddle::string::join_strings(std::get<1>(op_funcs[i]), '\n') + << "\n {nullptr,nullptr,0,nullptr}" + << "};\n\n"; + + out << "void BindOpFunctions" << i + 1 << "(pybind11::module *module) {\n" + << " auto m = module->def_submodule(\"ops\");\n" + << " if (PyModule_AddFunctions(m.ptr(), ExtestMethods) < 0) {\n" + << " PADDLE_THROW(platform::errors::Fatal (\"Add functions to " + "core.ops failed!\"));\n" + << " }\n\n" + << " InitOpsAttrTypeMap();" + << "}\n\n" + << "} // namespace pybind\n" + << "} // namespace paddle\n"; + + out.close(); + } #ifdef PADDLE_WITH_ASCEND_CL ge::GEFinalize(); diff --git a/paddle/fluid/pybind/parallel_executor.cc b/paddle/fluid/pybind/parallel_executor.cc new file mode 100644 index 0000000000..f1d2f456a2 --- /dev/null +++ b/paddle/fluid/pybind/parallel_executor.cc @@ -0,0 +1,1118 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Copyright (c) 2022 NVIDIA Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include + +#include +#include +#include +#include +#include +#include +#include // NOLINT // for call_once +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/framework/custom_operator.h" +#include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/framework/data_type_transform.h" +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/executor_cache.h" +#include "paddle/fluid/framework/executor_gc_helper.h" +#include "paddle/fluid/framework/feed_fetch_method.h" +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/garbage_collector.h" +#include "paddle/fluid/framework/io/fs.h" +#include "paddle/fluid/framework/ir/coalesce_grad_tensor_pass.h" +#include "paddle/fluid/framework/ir/cost_model.h" +#include "paddle/fluid/framework/ir/generate_pass.h" +#include "paddle/fluid/framework/ir/pass_builder.h" +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/new_executor/executor_statistics.h" +#include "paddle/fluid/framework/new_executor/standalone_executor.h" +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/framework/parallel_executor.h" +#include "paddle/fluid/framework/phi_utils.h" +#include "paddle/fluid/framework/prune.h" +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/save_load_util.h" +#include "paddle/fluid/framework/scope_pool.h" +#include "paddle/fluid/framework/selected_rows_utils.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/trainer.h" +#include "paddle/fluid/framework/type_defs.h" +#include "paddle/fluid/framework/version.h" +#include "paddle/fluid/imperative/amp_auto_cast.h" +#include "paddle/fluid/imperative/layer.h" +#include "paddle/fluid/memory/allocation/allocator_strategy.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/memory/allocation/cuda_ipc_allocator.h" +#endif +#include "paddle/fluid/memory/allocation/mmap_allocator.h" +#include "paddle/fluid/operators/activation_op.h" +#include "paddle/fluid/operators/common_infer_shape_functions.h" +#include "paddle/fluid/operators/py_func_op.h" +#include "paddle/fluid/platform/cpu_helper.h" +#include "paddle/fluid/platform/cpu_info.h" +#include "paddle/fluid/platform/device/device_wrapper.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/dynload/dynamic_loader.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/init.h" +#include "paddle/fluid/platform/monitor.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_python.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" +#include 
"paddle/fluid/platform/profiler/profiler.h" +#include "paddle/fluid/pybind/cuda_streams_py.h" +#include "paddle/fluid/pybind/distributed_py.h" +#include "paddle/fluid/pybind/eager.h" +#include "paddle/fluid/pybind/imperative.h" +#include "paddle/fluid/pybind/io.h" +#include "paddle/phi/core/compat/convert_utils.h" +#include "paddle/phi/core/lod_utils.h" +#include "paddle/utils/none.h" +#ifdef PADDLE_WITH_ASCEND +#include "paddle/fluid/pybind/ascend_wrapper_py.h" +#endif +#include "paddle/fluid/pybind/bind_cost_model.h" +#include "paddle/fluid/pybind/bind_fleet_executor.h" +#include "paddle/fluid/pybind/box_helper_py.h" +#include "paddle/fluid/pybind/communication.h" +#include "paddle/fluid/pybind/compatible.h" +#include "paddle/fluid/pybind/const_value.h" +#include "paddle/fluid/pybind/data_set_py.h" +#include "paddle/fluid/pybind/exception.h" +#include "paddle/fluid/pybind/fleet_wrapper_py.h" +#include "paddle/fluid/pybind/generator_py.h" +#include "paddle/fluid/pybind/global_value_getter_setter.h" +#include "paddle/fluid/pybind/gloo_context_py.h" +#include "paddle/fluid/pybind/gloo_wrapper_py.h" +#include "paddle/fluid/pybind/heter_wrapper_py.h" +#include "paddle/fluid/pybind/inference_api.h" +#include "paddle/fluid/pybind/ir.h" +#include "paddle/fluid/pybind/metrics_py.h" +#include "paddle/fluid/pybind/ps_gpu_wrapper_py.h" +#include "paddle/fluid/pybind/pybind_boost_headers.h" +#include "paddle/phi/backends/device_manager.h" + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/fluid/pybind/nccl_wrapper_py.h" +#endif +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/pybind/protobuf.h" +#include "paddle/fluid/pybind/pybind.h" // NOLINT +#include "paddle/fluid/pybind/reader_py.h" +#include "paddle/fluid/pybind/tensor_py.h" +#include "paddle/fluid/string/to_string.h" +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" +#endif +#ifndef PADDLE_WITH_HIP +#include "paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h" +#endif +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#endif + +#ifdef PADDLE_WITH_ASCEND_CL +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/device/npu/npu_info.h" +#endif + +#ifdef PADDLE_WITH_XPU +#include "paddle/fluid/platform/device/xpu/xpu_info.h" +#include "paddle/fluid/platform/device/xpu/xpu_op_list.h" +#endif + +#ifdef PADDLE_WITH_CUSTOM_DEVICE +#include "paddle/phi/capi/capi.h" +#endif + +#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" + +#ifdef PADDLE_WITH_IPU +#include "paddle/fluid/platform/device/ipu/ipu_backend.h" +#include "paddle/fluid/platform/device/ipu/ipu_info.h" +#endif + +#ifdef PADDLE_WITH_MLU +#include "paddle/fluid/platform/device/mlu/mlu_info.h" +#endif + +#ifdef PADDLE_WITH_CRYPTO +#include "paddle/fluid/pybind/crypto.h" +#endif + +#if defined PADDLE_WITH_PSCORE +#include "paddle/fluid/pybind/fleet_py.h" +#endif + +#ifdef PADDLE_WITH_CINN +#include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h" +#endif + +#include "paddle/fluid/eager/api/utils/global_utils.h" +#include "paddle/fluid/imperative/layout_autotune.h" +#include "paddle/fluid/pybind/eager_utils.h" +#include "paddle/fluid/pybind/parallel_executor.h" +#include "paddle/phi/api/ext/op_meta_info.h" +#include "paddle/phi/kernels/autotune/cache.h" +#include "paddle/phi/kernels/autotune/switch_autotune.h" +#include "pybind11/stl.h" + +DECLARE_bool(use_mkldnn); 
+ +// disable auto conversion to list in Python +PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray); +PYBIND11_MAKE_OPAQUE(paddle::framework::FetchUnmergedList); +PYBIND11_MAKE_OPAQUE(paddle::framework::FetchList); +PYBIND11_MAKE_OPAQUE(paddle::framework::FetchType); + +namespace paddle { +namespace pybind { +using namespace paddle::framework; // NOLINT +void BindParallelExecutor(pybind11::module &m) { // NOLINT + // -- python binds for parallel executor. + py::class_ pe(m, "ParallelExecutor"); + py::class_ exec_strategy(pe, "ExecutionStrategy", R"DOC( + ExecutionStrategy allows the user to more preciously control how to run + the program in ParallelExecutor by setting the property. + + Returns: + ExecutionStrategy: An ExecutionStrategy object. + + Examples: + .. code-block:: python + + import paddle + import paddle.static as static + import paddle.nn.functional as F + + paddle.enable_static() + + x = static.data(name='x', shape=[None, 13], dtype='float32') + y = static.data(name='y', shape=[None, 1], dtype='float32') + y_predict = static.nn.fc(input=x, size=1, act=None) + + cost = F.square_error_cost(input=y_predict, label=y) + avg_loss = paddle.mean(cost) + + sgd_optimizer = paddle.optimizer.SGD(learning_rate=0.001) + sgd_optimizer.minimize(avg_loss) + + exec_strategy = static.ExecutionStrategy() + exec_strategy.num_threads = 4 + + train_exe = static.ParallelExecutor(use_cuda=False, + loss_name=avg_loss.name, + exec_strategy=exec_strategy) + )DOC"); + + py::enum_(m, "DeviceType", py::arithmetic()) + .value("CPU", paddle::platform::DeviceType::CPU) + .value("CUDA", paddle::platform::DeviceType::CUDA) + .value("XPU", paddle::platform::DeviceType::XPU); + + exec_strategy.def(py::init()) + .def_property( + "num_threads", + [](const ExecutionStrategy &self) { return self.num_threads_; }, + [](ExecutionStrategy &self, size_t num_threads) { + self.num_threads_ = num_threads; + }, + R"DOC( + The type is INT, num_threads represents the size of thread pool that + used to run the operators of the current program in ParallelExecutor. + If :math:`num\_threads=1`, all the operators will execute one by one, + but the order maybe difference between iterations. + If it is not set, it will be set in ParallelExecutor according to the + device type and device count, for GPU, :math:`num\_threads=device\_count*4`, for CPU, + :math:`num\_threads=CPU\_NUM*4`, the explanation of:math:`CPU\_NUM` is in ParallelExecutor. + if it is not set, ParallelExecutor will get the cpu count by calling + `multiprocessing.cpu_count()`. Default 0. + + Examples: + .. code-block:: python + + import paddle + import paddle.static as static + + paddle.enable_static() + + exec_strategy = static.ExecutionStrategy() + exec_strategy.num_threads = 4 + )DOC") + .def_property( + "_use_device", + [](const ExecutionStrategy &self) { return self.use_device_; }, + [](ExecutionStrategy &self, paddle::platform::DeviceType use_device) { + self.use_device_ = use_device; + }) // NOTE(liuyuhui): Doesn't add doc for 'use_device', because + // use_device isn‘t exposed to users. + .def_property( + "allow_op_delay", + [](const ExecutionStrategy &self) { return self.allow_op_delay_; }, + [](ExecutionStrategy &self, bool allow_op_delay) { + self.allow_op_delay_ = allow_op_delay; + }, + R"DOC(The type is BOOL, allow_op_delay represents whether to delay the + communication operators to run, it may make the execution faster. + Note that this option is invalid now, and it will be removed in + next version. 
Default False.)DOC") + .def_property( + "num_iteration_per_drop_scope", + [](const ExecutionStrategy &self) { + return self.num_iteration_per_drop_scope_; + }, + [](ExecutionStrategy &self, size_t num_iteration_per_drop_scope) { + self.num_iteration_per_drop_scope_ = num_iteration_per_drop_scope; + }, + R"DOC(The type is INT, num_iteration_per_drop_scope indicates how + many iterations to clean up the temp variables which + is generated during execution. It may make the execution faster, + because the temp variable's shape maybe the same between two iterations. + Default 100. + + .. note:: + 1. If you fetch data when calling the 'run', the ParallelExecutor + will clean up the temp variables at the end of the current iteration. + 2. In some NLP model, it may cause the GPU memory is insufficient, + in this case, you should reduce `num_iteration_per_drop_scope`. + + Examples: + .. code-block:: python + + import paddle + import paddle.static as static + + paddle.enable_static() + + exec_strategy = static.ExecutionStrategy() + exec_strategy.num_iteration_per_drop_scope = 10 + )DOC") + .def_property( + "num_iteration_per_run", + [](const ExecutionStrategy &self) { + return self.num_iteration_per_run_; + }, + [](ExecutionStrategy &self, size_t num_iteration_per_run) { + self.num_iteration_per_run_ = num_iteration_per_run; + }, + R"DOC(This config that how many iteration the executor will run when + user call exe.run() in python。Default: 1. + + Examples: + .. code-block:: python + + import paddle + import paddle.static as static + + paddle.enable_static() + + exec_strategy = static.ExecutionStrategy() + exec_strategy.num_iteration_per_run = 10 + )DOC") + .def_property( + "use_thread_barrier", + [](const ExecutionStrategy &self) { return self.thread_barrier_; }, + [](ExecutionStrategy &self, bool use_thread_barrier) { + self.thread_barrier_ = use_thread_barrier; + }, + R"DOC(This config that the this is distributed training with parameter server + )DOC") + .def_property( + "_dry_run", + [](const ExecutionStrategy &self) { return self.dry_run_; }, + [](ExecutionStrategy &self, bool dry_run) { + self.dry_run_ = dry_run; + }); + + exec_strategy.def_property( + "use_experimental_executor", + [](const ExecutionStrategy &self) { + return self.type_ == ExecutionStrategy::kExperimental; + }, + [](ExecutionStrategy &self, bool experimental) { + self.type_ = experimental ? ExecutionStrategy::kExperimental + : ExecutionStrategy::kDefault; + }); + + py::class_ build_strategy(pe, "BuildStrategy", R"DOC( + BuildStrategy allows the user to more preciously control how to + build the SSA Graph in ParallelExecutor by setting the property. + + Returns: + BuildStrategy: An BuildStrategy object. + + Examples: + .. 
code-block:: python + + import os + import paddle + import paddle.static as static + + paddle.enable_static() + + os.environ['CPU_NUM'] = str(2) + places = static.cpu_places() + + data = static.data(name="x", shape=[None, 1], dtype="float32") + hidden = static.nn.fc(input=data, size=10) + loss = paddle.mean(hidden) + paddle.optimizer.SGD(learning_rate=0.01).minimize(loss) + + build_strategy = static.BuildStrategy() + build_strategy.enable_inplace = True + build_strategy.memory_optimize = True + build_strategy.reduce_strategy = static.BuildStrategy.ReduceStrategy.Reduce + program = static.CompiledProgram(static.default_main_program()) + program = program.with_data_parallel(loss_name=loss.name, + build_strategy=build_strategy, + places=places) +)DOC"); + + py::enum_(build_strategy, "ReduceStrategy") + .value("Reduce", BuildStrategy::ReduceStrategy::kReduce) + .value("AllReduce", BuildStrategy::ReduceStrategy::kAllReduce) + .value("_NoReduce", BuildStrategy::ReduceStrategy::kNoReduce); + py::enum_(build_strategy, + "GradientScaleStrategy") + .value("CoeffNumDevice", + BuildStrategy::GradientScaleStrategy::kCoeffNumDevice) + .value("One", BuildStrategy::GradientScaleStrategy::kOne) + .value("Customized", BuildStrategy::GradientScaleStrategy::kCustomized); + + build_strategy.def(py::init()) + .def("_clear_finalized", &BuildStrategy::ClearFinalized) + .def_property( + "reduce_strategy", + [](const BuildStrategy &self) { return self.reduce_; }, + [](BuildStrategy &self, BuildStrategy::ReduceStrategy strategy) { + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, + platform::errors::PreconditionNotMet( + "BuildStrategy has been finlaized, cannot be " + "configured again.")); + self.reduce_ = strategy; + }, + R"DOC((fluid.BuildStrategy.ReduceStrategy, optional): there are two reduce + strategies in ParallelExecutor, AllReduce and Reduce. If you want + that all the parameters' optimization are done on all devices independently, + you should choose AllReduce; otherwise, if you choose Reduce, all the parameters' + optimization will be evenly distributed to different devices, and then + broadcast the optimized parameter to other devices. + Default is 'AllReduce'. + + Examples: + .. code-block:: python + + import paddle + import paddle.static as static + + paddle.enable_static() + + build_strategy = static.BuildStrategy() + build_strategy.reduce_strategy = static.BuildStrategy.ReduceStrategy.Reduce + )DOC") + .def_property( + "gradient_scale_strategy", + [](const BuildStrategy &self) { return self.gradient_scale_; }, + [](BuildStrategy &self, + BuildStrategy::GradientScaleStrategy strategy) { + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, + platform::errors::PreconditionNotMet( + "BuildStrategy has been finlaized, cannot be " + "configured again.")); + self.gradient_scale_ = strategy; + }, + R"DOC((paddle.static.BuildStrategy.GradientScaleStrategy, optional): there are three + ways of defining :math:`loss@grad` in ParallelExecutor, that is, CoeffNumDevice, + One and Customized. By default, ParallelExecutor sets the :math:`loss@grad` + according to the number of devices. If you want to customize :math:`loss@grad`, + you can choose Customized. Default is 'CoeffNumDevice'. + + Examples: + .. 
code-block:: python + + import numpy + import os + import paddle + import paddle.static as static + + paddle.enable_static() + + use_cuda = True + place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() + exe = static.Executor(place) + + # NOTE: If you use CPU to run the program, you need + # to specify the CPU_NUM, otherwise, paddle will use + # all the number of the logic core as the CPU_NUM, + # in that case, the batch size of the input should be + # greater than CPU_NUM, if not, the process will be + # failed by an exception. + if not use_cuda: + os.environ['CPU_NUM'] = str(2) + places = static.cpu_places() + else: + places = static.cuda_places() + + data = static.data(name='X', shape=[None, 1], dtype='float32') + hidden = static.nn.fc(input=data, size=10) + loss = paddle.mean(hidden) + paddle.optimizer.SGD(learning_rate=0.01).minimize(loss) + + exe.run(static.default_startup_program()) + + build_strategy = static.BuildStrategy() + build_strategy.gradient_scale_strategy = \ + static.BuildStrategy.GradientScaleStrategy.Customized + compiled_prog = static.CompiledProgram( + static.default_main_program()).with_data_parallel( + loss_name=loss.name, build_strategy=build_strategy, + places=places) + + dev_count = len(places) + x = numpy.random.random(size=(10, 1)).astype('float32') + loss_grad = numpy.ones((dev_count)).astype("float32") * 0.01 + loss_grad_name = loss.name+"@GRAD" + loss_data = exe.run(compiled_prog, + feed={"X": x, loss_grad_name : loss_grad}, + fetch_list=[loss.name, loss_grad_name]) + )DOC") + .def_property( + "debug_graphviz_path", + [](const BuildStrategy &self) { return self.debug_graphviz_path_; }, + [](BuildStrategy &self, const std::string &path) { + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, + platform::errors::PreconditionNotMet( + "BuildStrategy has been finlaized, cannot be " + "configured again.")); + self.debug_graphviz_path_ = path; + }, + R"DOC((str, optional): debug_graphviz_path indicates the path that + writing the SSA Graph to file in the form of graphviz. + It is useful for debugging. Default is empty string, that is, "" + + Examples: + .. code-block:: python + + import paddle + import paddle.static as static + + paddle.enable_static() + + build_strategy = static.BuildStrategy() + build_strategy.debug_graphviz_path = "./graph" + )DOC") + .def_property( + "enable_sequential_execution", + [](const BuildStrategy &self) { + return self.enable_sequential_execution_; + }, + [](BuildStrategy &self, bool b) { + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, + platform::errors::PreconditionNotMet( + "BuildStrategy has been finlaized, cannot be " + "configured again.")); + self.enable_sequential_execution_ = b; + }, + R"DOC((bool, optional): If set True, the execution order of ops would + be the same as what is in the program. Default is False. + + Examples: + .. code-block:: python + + import paddle + import paddle.static as static + + paddle.enable_static() + + build_strategy = static.BuildStrategy() + build_strategy.enable_sequential_execution = True + )DOC") + .def_property( + "remove_unnecessary_lock", + [](const BuildStrategy &self) { + return self.remove_unnecessary_lock_; + }, + [](BuildStrategy &self, bool b) { + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, + platform::errors::PreconditionNotMet( + "BuildStrategy has been finlaized, cannot be " + "configured again.")); + self.remove_unnecessary_lock_ = b; + }, + R"DOC((bool, optional): If set True, some locks in GPU ops would be + released and ParallelExecutor would run faster. Default is True. 
+ + Examples: + .. code-block:: python + + import paddle + import paddle.static as static + + paddle.enable_static() + + build_strategy = static.BuildStrategy() + build_strategy.remove_unnecessary_lock = True + )DOC") + .def_property( + "num_trainers", + [](const BuildStrategy &self) { return self.num_trainers_; }, + [](BuildStrategy &self, int num_trainers) { +#ifdef WIN32 + PADDLE_THROW(platform::errors::Unavailable( + "Distribution mode is not supported on Windows platform.")); +#endif + self.num_trainers_ = num_trainers; + }) + .def_property( + "trainers_endpoints", + [](const BuildStrategy &self) { return self.trainers_endpoints_; }, + [](BuildStrategy &self, + const std::vector &trainers_endpoints) { + self.trainers_endpoints_ = trainers_endpoints; + }) + .def_property( + "trainer_id", + [](const BuildStrategy &self) { return self.trainer_id_; }, + [](BuildStrategy &self, int trainer_id) { + self.trainer_id_ = trainer_id; + }) + .def_property( + "nccl_comm_num", + [](const BuildStrategy &self) { return self.nccl_comm_num_; }, + [](BuildStrategy &self, int nccl_comm_num) { + self.nccl_comm_num_ = nccl_comm_num; + }) + .def_property( + "bkcl_comm_num", + [](const BuildStrategy &self) { return self.bkcl_comm_num_; }, + [](BuildStrategy &self, int bkcl_comm_num) { + self.bkcl_comm_num_ = bkcl_comm_num; + }) + .def_property( + "use_hierarchical_allreduce", + [](const BuildStrategy &self) { + return self.use_hierarchical_allreduce_; + }, + [](BuildStrategy &self, bool use) { + self.use_hierarchical_allreduce_ = use; + }) + .def_property( + "hierarchical_allreduce_inter_nranks", + [](const BuildStrategy &self) { + return self.hierarchical_allreduce_inter_nranks_; + }, + [](BuildStrategy &self, int nranks) { + self.hierarchical_allreduce_inter_nranks_ = nranks; + }) + + .def_property( + "fuse_elewise_add_act_ops", + [](const BuildStrategy &self) { + return self.fuse_elewise_add_act_ops_; + }, + [](BuildStrategy &self, bool b) { + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, + platform::errors::PreconditionNotMet( + "BuildStrategy has been finlaized, cannot be " + "configured again.")); + self.fuse_elewise_add_act_ops_ = b; + }, + R"DOC((bool, optional): fuse_elewise_add_act_ops indicate whether + to fuse elementwise_add_op and activation_op, + it may make the execution faster. Default is False. + + Examples: + .. code-block:: python + + import paddle + import paddle.static as static + + paddle.enable_static() + + build_strategy = static.BuildStrategy() + build_strategy.fuse_elewise_add_act_ops = True + )DOC") + .def_property( + "fuse_gemm_epilogue", + [](const BuildStrategy &self) { return self.fuse_gemm_epilogue_; }, + [](BuildStrategy &self, bool b) { + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, + platform::errors::PreconditionNotMet( + "BuildStrategy has been finlaized, cannot be " + "configured again.")); + self.fuse_gemm_epilogue_ = b; + }, + R"DOC((bool, optional): fuse_gemm_epilogue indicate whether + to fuse matmul_op, elemenewist_add_op and activation_op, + it may make the execution faster. Default is False. + + Examples: + .. 
code-block:: python + + import paddle + import paddle.static as static + + paddle.enable_static() + + build_strategy = static.BuildStrategy() + build_strategy.fuse_gemm_epilogue = True + )DOC") + .def_property( + "fuse_bn_act_ops", + [](const BuildStrategy &self) { return self.fuse_bn_act_ops_; }, + [](BuildStrategy &self, bool b) { + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, + platform::errors::PreconditionNotMet( + "BuildStrategy has been finlaized, cannot be " + "configured again.")); + self.fuse_bn_act_ops_ = b; + }, + R"DOC((bool, optional): fuse_bn_act_ops indicate whether + to fuse batch_norm and activation_op, + it may make the execution faster. Default is False. + + Examples: + .. code-block:: python + + import paddle + import paddle.static as static + + paddle.enable_static() + + build_strategy = static.BuildStrategy() + build_strategy.fuse_bn_act_ops = True + )DOC") + .def_property( + "fuse_bn_add_act_ops", + [](const BuildStrategy &self) { return self.fuse_bn_add_act_ops_; }, + [](BuildStrategy &self, bool b) { + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, + platform::errors::PreconditionNotMet( + "BuildStrategy has been finlaized, cannot be " + "configured again.")); + self.fuse_bn_add_act_ops_ = b; + }, + R"DOC((bool, optional): fuse_bn_add_act_ops indicate whether + to fuse batch_norm, elementwise_add and activation_op, + it may make the execution faster. Default is True + + Examples: + .. code-block:: python + + import paddle + import paddle.static as static + + paddle.enable_static() + + build_strategy = static.BuildStrategy() + build_strategy.fuse_bn_add_act_ops = True + )DOC") + .def_property( + "enable_auto_fusion", + [](const BuildStrategy &self) { return self.enable_auto_fusion_; }, + [](BuildStrategy &self, bool b) { + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, + platform::errors::PreconditionNotMet( + "BuildStrategy has been finlaized, cannot be " + "configured again.")); + self.enable_auto_fusion_ = b; + }, + R"DOC((bool, optional): Whether to enable fusing subgraph to a + fusion_group. Now we only support fusing subgraph that composed + of elementwise-like operators, such as elementwise_add/mul + without broadcast and activations. + + Examples: + .. code-block:: python + + import paddle + import paddle.static as static + + paddle.enable_static() + + build_strategy = static.BuildStrategy() + build_strategy.enable_auto_fusion = True + )DOC") + .def_property( + "fuse_relu_depthwise_conv", + [](const BuildStrategy &self) { + return self.fuse_relu_depthwise_conv_; + }, + [](BuildStrategy &self, bool b) { + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, + platform::errors::PreconditionNotMet( + "BuildStrategy has been finlaized, cannot be " + "configured again.")); + self.fuse_relu_depthwise_conv_ = b; + }, + R"DOC((bool, optional): fuse_relu_depthwise_conv indicate whether + to fuse relu and depthwise_conv2d, + it will save GPU memory and may make the execution faster. + This options is only available in GPU devices. + Default is False. + + Examples: + .. 
code-block:: python + + import paddle + import paddle.static as static + + paddle.enable_static() + + build_strategy = static.BuildStrategy() + build_strategy.fuse_relu_depthwise_conv = True + )DOC") + .def_property( + "fuse_broadcast_ops", + [](const BuildStrategy &self) { + return self.fuse_broadcast_ops_ == true || + self.fuse_broadcast_ops_ == paddle::none; + }, + [](BuildStrategy &self, bool b) { + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, + platform::errors::PreconditionNotMet( + "BuildStrategy has been finlaized, " + "cannot be configured again.")); + self.fuse_broadcast_ops_ = b; + }, + R"DOC((bool, optional): fuse_broadcast_op indicates whether + to fuse the broadcast ops. Note that, in Reduce mode, + fusing broadcast ops may make the program faster. Because + fusing broadcast OP equals delaying the execution of all + broadcast Ops, in this case, all nccl streams are used only + for NCCLReduce operations for a period of time. Default False. + + Examples: + .. code-block:: python + + import paddle + import paddle.static as static + + paddle.enable_static() + + build_strategy = static.BuildStrategy() + build_strategy.fuse_broadcast_ops = True + )DOC") + .def_property( + "fuse_all_optimizer_ops", + [](const BuildStrategy &self) { + return self.fuse_all_optimizer_ops_ == true || + self.fuse_all_optimizer_ops_ == paddle::none; + }, + [](BuildStrategy &self, bool b) { + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, + platform::errors::PreconditionNotMet( + "BuildStrategy has been finlaized, " + "cannot be configured again.")); + self.fuse_all_optimizer_ops_ = b; + }) + .def_property( + "sync_batch_norm", + [](const BuildStrategy &self) { return self.sync_batch_norm_; }, + [](BuildStrategy &self, bool b) { + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, + platform::errors::PreconditionNotMet( + "BuildStrategy has been finlaized, cannot be " + "configured again.")); + self.sync_batch_norm_ = b; + }, + R"DOC((bool, optional): sync_batch_norm indicates whether to use + synchronous batch normalization which synchronizes the mean + and variance through multi-devices in training phase. + Current implementation doesn't support FP16 training and CPU. + And only synchronous on one machine, not all machines. + Default is False. + + Examples: + .. code-block:: python + + import paddle + import paddle.static as static + + paddle.enable_static() + + build_strategy = static.BuildStrategy() + build_strategy.sync_batch_norm = True + )DOC") + .def_property( + "memory_optimize", + [](const BuildStrategy &self) -> py::object { + if (self.memory_optimize_) { + return py::cast(self.memory_optimize_.get()); + } else { + return py::cast(nullptr); + } + }, + [](BuildStrategy &self, const py::handle &value) { + auto *py_obj = value.ptr(); + if (py_obj == nullptr || py_obj == Py_None) { + self.memory_optimize_ = paddle::none; + } else if (PyBool_Check(py_obj)) { + self.memory_optimize_ = (py_obj == Py_True); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "BuildStrategy.memory_optimize must be set to None, False " + "or True")); + } + }, + R"DOC((bool, optional): memory opitimize aims to save total memory + consumption, set to True to enable it. + + Default None. None means framework would choose to use or not use + this strategy automatically. Currently, None means that it is + enabled when GC is disabled, and disabled when GC is enabled. + True means enabling and False means disabling. Default is None. + + Examples: + .. 
code-block:: python + + import paddle + import paddle.static as static + + paddle.enable_static() + + build_strategy = static.BuildStrategy() + build_strategy.memory_optimize = True + + )DOC") + .def_property( + "is_distribution", + [](const BuildStrategy &self) { return self.is_distribution_; }, + [](BuildStrategy &self, bool b) { +#ifdef WIN32 + if (b) { + PADDLE_THROW(platform::errors::Unavailable( + "Distribution mode is not supported on Windows platform.")); + } +#else + self.is_distribution_ = b; +#endif + }) + .def_property( + "async_mode", + [](const BuildStrategy &self) { return self.async_mode_; }, + [](BuildStrategy &self, bool b) { self.async_mode_ = b; }) + .def_property( + "enable_inplace", + [](const BuildStrategy &self) { return self.enable_inplace_; }, + [](BuildStrategy &self, bool b) { self.enable_inplace_ = b; }) + .def_property( + "enable_addto", + [](const BuildStrategy &self) { return self.enable_addto_; }, + [](BuildStrategy &self, bool b) { self.enable_addto_ = b; }) + .def_property( + "fuse_all_reduce_ops", + [](const BuildStrategy &self) { + return self.fuse_all_reduce_ops_ == true || + self.fuse_all_reduce_ops_ == paddle::none; + }, + [](BuildStrategy &self, bool b) { self.fuse_all_reduce_ops_ = b; }) + .def_property( + "enable_backward_optimizer_op_deps", + [](const BuildStrategy &self) { + return self.enable_backward_optimizer_op_deps_; + }, + [](BuildStrategy &self, bool b) { + self.enable_backward_optimizer_op_deps_ = b; + }) + .def_property( + "cache_runtime_context", + [](const BuildStrategy &self) { return self.cache_runtime_context_; }, + [](BuildStrategy &self, bool b) { self.cache_runtime_context_ = b; }) + .def_property( + "mkldnn_enabled_op_types", + [](const BuildStrategy &self) { + return self.mkldnn_enabled_op_types_; + }, + [](BuildStrategy &self, + const std::unordered_set &mkldnn_enabled_op_types) { + self.mkldnn_enabled_op_types_ = mkldnn_enabled_op_types; + }) + .def_property( + "fix_op_run_order", + [](const BuildStrategy &self) { return self.fix_op_run_order_; }, + [](BuildStrategy &self, bool fix_op_run_order) { + self.fix_op_run_order_ = fix_op_run_order; + }) + .def_property( + "allow_cuda_graph_capture", + [](const BuildStrategy &self) { + return self.allow_cuda_graph_capture_; + }, + [](BuildStrategy &self, bool allow_cuda_graph_capture) { + self.allow_cuda_graph_capture_ = allow_cuda_graph_capture; + }) + .def("_copy", + [](const BuildStrategy &self) { + auto new_bs = self; + new_bs.ClearFinalized(); + return new_bs; + }) + .def( + "_finalize_strategy_and_create_passes", + [](BuildStrategy &self) -> std::shared_ptr { + return self.CreatePassesFromStrategy(true); + }, + R"DOC(Allow user to customized passes. Normally model-specific + optimization passes should be defined in this way. BuildStrategy + cannot be updated after being finalized.)DOC"); + + m.def("_set_cached_executor_build_strategy", + [](int64_t program_id, const BuildStrategy &build_strategy) { + auto &cached_exe_info = framework::ExecutorInfoCache::Instance(); + cached_exe_info.SetBuildStrategy(program_id, build_strategy); + }); + + pe.def(py::init &, + const std::vector &, + const std::string &, + Scope *, + std::vector &, + const ExecutionStrategy &, + const BuildStrategy &, + ir::Graph *>()) + // NOTE: even we return a vec* to Python use reference policy. + // We still cannot get local_scope from this vector, since the element + // of vec will be freed by Python GC. We can only return Scope* + // one by one and mark them as reference. 
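Every writable BuildStrategy property above follows the same shape: a `def_property` whose getter simply returns the member and whose setter refuses to touch a strategy that has already been finalized. The stand-alone sketch below illustrates just that pattern; the `Strategy` struct and module name are placeholders rather than Paddle types, and a plain exception stands in for the `PADDLE_ENFORCE_NE(IsFinalized(), true, ...)` check used in the real binding.

.. code-block:: cpp

    // Schematic, stand-alone illustration of the guarded-setter pattern used
    // for the BuildStrategy properties above. Strategy and strategy_demo are
    // placeholder names, not Paddle types.
    #include <pybind11/pybind11.h>

    #include <stdexcept>

    namespace py = pybind11;

    struct Strategy {
      bool fuse_ops_ = false;
      bool finalized_ = false;
      bool IsFinalized() const { return finalized_; }
    };

    PYBIND11_MODULE(strategy_demo, m) {
      py::class_<Strategy>(m, "Strategy")
          .def(py::init<>())
          .def_property(
              "fuse_ops",
              // Getter: a plain read of the member.
              [](const Strategy &self) { return self.fuse_ops_; },
              // Setter: refuse to reconfigure a finalized strategy, mirroring
              // the PADDLE_ENFORCE_NE(IsFinalized(), true, ...) checks above.
              [](Strategy &self, bool b) {
                if (self.IsFinalized()) {
                  throw std::runtime_error(
                      "Strategy has been finalized, cannot be configured "
                      "again.");
                }
                self.fuse_ops_ = b;
              });
    }

In the Paddle binding the setter raises through `PADDLE_ENFORCE_NE` with a `PreconditionNotMet` error instead of a plain `runtime_error`, but the control flow is the same.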
+ .def( + "local_scopes", + [](ParallelExecutor &self) -> std::vector * { + return &self.GetLocalScopes(); + }, + py::return_value_policy::reference) + .def("drop_local_exe_scopes", &ParallelExecutor::DropLocalExeScopes) + .def("_need_create_local_exe_scopes", + &ParallelExecutor::NeedCreateLocalExeScope) + .def("feed_tensors_into_local_scopes", + &ParallelExecutor::FeedTensorsIntoLocalScopes) + .def("feed_and_split_tensor_into_local_scopes", + &ParallelExecutor::FeedAndSplitTensorIntoLocalScopes) + .def("run", + [](ParallelExecutor &self, + const std::vector &fetch_tensors, + bool return_merged) -> py::object { + if (return_merged) { + paddle::framework::FetchList ret; + /*gil_scoped_release*/ { + pybind11::gil_scoped_release release; + ret = self.RunAndMerge(fetch_tensors); + } + return py::cast(std::move(ret)); + } else { + paddle::framework::FetchUnmergedList ret; + /*gil_scoped_release*/ { + pybind11::gil_scoped_release release; + ret = self.Run(fetch_tensors); + } + return py::cast(std::move(ret)); + } + }) + .def("device_count", &ParallelExecutor::DeviceCount); + using VarQuantScale = + std::unordered_map>; + py::class_> pass(m, "Pass"); + pass.def(py::init()) + .def("has", &ir::Pass::Has) + .def("set_not_owned", + [](ir::Pass &self, const std::string &attr_name, ProgramDesc &attr) { + self.SetNotOwned(attr_name, &attr); + }) + .def( + "set", + [](ir::Pass &self, const std::string &name, const std::string &attr) { + self.Set(name, new std::string(attr)); + }) + .def("set", + [](ir::Pass &self, const std::string &name, bool val) { + self.Set(name, new bool(val)); + }) + .def("set", + [](ir::Pass &self, const std::string &name, int val) { + self.Set(name, new int(val)); + }) + .def("set", + [](ir::Pass &self, + const std::string &name, + std::vector set) { + self.Set(name, new std::vector(set)); + }) + .def("set", + [](ir::Pass &self, + const std::string &name, + std::unordered_set set) { + self.Set(name, new std::unordered_set(set)); + }) + .def("set", + [](ir::Pass &self, + const std::string &name, + std::unordered_set set) { + self.Set(name, new std::unordered_set(set)); + }) + .def("set", + [](ir::Pass &self, const std::string &name, VarQuantScale scales) { + self.Set(name, new VarQuantScale(scales)); + }) + .def("type", &ir::Pass::Type) + .def("apply", [](ir::Pass &self, std::shared_ptr graph) { + self.Apply(graph.get()); + }); + + py::class_> pb( + m, "PassBuilder"); + pb.def(py::init()) + .def("append_pass", + [](ir::PassBuilder &self, + const std::string &pass_type) -> std::shared_ptr { + return self.AppendPass(pass_type); + }) + .def("all_passes", [](ir::PassBuilder &self) { return self.AllPasses(); }) + .def("insert_pass", + [](ir::PassBuilder &self, size_t idx, const std::string &pass_type) { + return self.InsertPass(idx, pass_type); + }) + .def("remove_pass", + [](ir::PassBuilder &self, size_t idx) { self.RemovePass(idx); }); +} + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/parallel_executor.h b/paddle/fluid/pybind/parallel_executor.h new file mode 100644 index 0000000000..3c3acace03 --- /dev/null +++ b/paddle/fluid/pybind/parallel_executor.h @@ -0,0 +1,25 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
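The `run` binding above wraps the blocking `Run` / `RunAndMerge` calls in a `pybind11::gil_scoped_release` scope, so other Python threads can make progress while the executor works, and only reacquires the GIL to convert the fetch results with `py::cast`. A minimal sketch of that idiom, with a placeholder `Worker` type standing in for ParallelExecutor:

.. code-block:: cpp

    // Minimal sketch of dropping the GIL around a long-running C++ call, as
    // the ParallelExecutor "run" binding above does for Run / RunAndMerge.
    // Worker and slow_compute are illustrative placeholders.
    #include <pybind11/pybind11.h>

    #include <chrono>
    #include <thread>

    namespace py = pybind11;

    struct Worker {
      int slow_compute(int x) {
        // Stands in for a long-running computation that never touches Python.
        std::this_thread::sleep_for(std::chrono::milliseconds(100));
        return x * 2;
      }
    };

    PYBIND11_MODULE(worker_demo, m) {
      py::class_<Worker>(m, "Worker")
          .def(py::init<>())
          .def("run", [](Worker &self, int x) -> py::object {
            int ret = 0;
            {
              // No Python objects may be created or touched in this scope.
              py::gil_scoped_release release;
              ret = self.slow_compute(x);
            }
            // The GIL is held again here, so converting the result is safe.
            return py::cast(ret);
          });
    }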
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "pybind11/pybind11.h" + +namespace paddle { +namespace pybind { + +void BindParallelExecutor(pybind11::module& m); // NOLINT + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/place.cc b/paddle/fluid/pybind/place.cc new file mode 100644 index 0000000000..84dca60c21 --- /dev/null +++ b/paddle/fluid/pybind/place.cc @@ -0,0 +1,816 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Copyright (c) 2022 NVIDIA Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include + +#include +#include +#include +#include +#include +#include +#include // NOLINT // for call_once +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/framework/custom_operator.h" +#include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/framework/data_type_transform.h" +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/executor_cache.h" +#include "paddle/fluid/framework/executor_gc_helper.h" +#include "paddle/fluid/framework/feed_fetch_method.h" +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/garbage_collector.h" +#include "paddle/fluid/framework/io/fs.h" +#include "paddle/fluid/framework/ir/coalesce_grad_tensor_pass.h" +#include "paddle/fluid/framework/ir/cost_model.h" +#include "paddle/fluid/framework/ir/generate_pass.h" +#include "paddle/fluid/framework/ir/pass_builder.h" +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/new_executor/executor_statistics.h" +#include "paddle/fluid/framework/new_executor/standalone_executor.h" +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/framework/parallel_executor.h" +#include "paddle/fluid/framework/phi_utils.h" +#include "paddle/fluid/framework/prune.h" +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/save_load_util.h" +#include "paddle/fluid/framework/scope_pool.h" +#include "paddle/fluid/framework/selected_rows_utils.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/trainer.h" +#include "paddle/fluid/framework/type_defs.h" +#include "paddle/fluid/framework/version.h" +#include "paddle/fluid/imperative/amp_auto_cast.h" +#include "paddle/fluid/imperative/layer.h" +#include "paddle/fluid/memory/allocation/allocator_strategy.h" +#ifdef 
PADDLE_WITH_CUDA +#include "paddle/fluid/memory/allocation/cuda_ipc_allocator.h" +#endif +#include "paddle/fluid/memory/allocation/mmap_allocator.h" +#include "paddle/fluid/operators/activation_op.h" +#include "paddle/fluid/operators/common_infer_shape_functions.h" +#include "paddle/fluid/operators/py_func_op.h" +#include "paddle/fluid/platform/cpu_helper.h" +#include "paddle/fluid/platform/cpu_info.h" +#include "paddle/fluid/platform/device/device_wrapper.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/dynload/dynamic_loader.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/init.h" +#include "paddle/fluid/platform/monitor.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_python.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" +#include "paddle/fluid/platform/profiler/profiler.h" +#include "paddle/fluid/pybind/cuda_streams_py.h" +#include "paddle/fluid/pybind/distributed_py.h" +#include "paddle/fluid/pybind/eager.h" +#include "paddle/fluid/pybind/imperative.h" +#include "paddle/fluid/pybind/io.h" +#include "paddle/phi/core/compat/convert_utils.h" +#include "paddle/phi/core/lod_utils.h" +#include "paddle/utils/none.h" +#ifdef PADDLE_WITH_ASCEND +#include "paddle/fluid/pybind/ascend_wrapper_py.h" +#endif +#include "paddle/fluid/pybind/bind_cost_model.h" +#include "paddle/fluid/pybind/bind_fleet_executor.h" +#include "paddle/fluid/pybind/box_helper_py.h" +#include "paddle/fluid/pybind/communication.h" +#include "paddle/fluid/pybind/compatible.h" +#include "paddle/fluid/pybind/const_value.h" +#include "paddle/fluid/pybind/data_set_py.h" +#include "paddle/fluid/pybind/exception.h" +#include "paddle/fluid/pybind/fleet_wrapper_py.h" +#include "paddle/fluid/pybind/generator_py.h" +#include "paddle/fluid/pybind/global_value_getter_setter.h" +#include "paddle/fluid/pybind/gloo_context_py.h" +#include "paddle/fluid/pybind/gloo_wrapper_py.h" +#include "paddle/fluid/pybind/heter_wrapper_py.h" +#include "paddle/fluid/pybind/inference_api.h" +#include "paddle/fluid/pybind/ir.h" +#include "paddle/fluid/pybind/metrics_py.h" +#include "paddle/fluid/pybind/ps_gpu_wrapper_py.h" +#include "paddle/fluid/pybind/pybind_boost_headers.h" +#include "paddle/phi/backends/device_manager.h" + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/fluid/pybind/nccl_wrapper_py.h" +#endif +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/pybind/protobuf.h" +#include "paddle/fluid/pybind/pybind.h" // NOLINT +#include "paddle/fluid/pybind/reader_py.h" +#include "paddle/fluid/pybind/tensor_py.h" +#include "paddle/fluid/string/to_string.h" +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" +#endif +#ifndef PADDLE_WITH_HIP +#include "paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h" +#endif +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#endif + +#ifdef PADDLE_WITH_ASCEND_CL +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/device/npu/npu_info.h" +#endif + +#ifdef PADDLE_WITH_XPU +#include "paddle/fluid/platform/device/xpu/xpu_info.h" +#include "paddle/fluid/platform/device/xpu/xpu_op_list.h" +#endif + +#ifdef PADDLE_WITH_CUSTOM_DEVICE +#include "paddle/phi/capi/capi.h" +#endif + +#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" 
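Like parallel_executor.cc above, place.cc is a self-contained translation unit that carries its own include set and exposes a single `BindPlace(pybind11::module&)` entry point; the main `PYBIND11_MODULE` in pybind.cc can then register each domain by calling the corresponding `Bind*` function. The call sites in pybind.cc are not part of this excerpt, so the sketch below only shows the splitting pattern, with placeholder names:

.. code-block:: cpp

    // Single-file illustration of the Bind-function-per-domain pattern. In the
    // real patch the declaration lives in a small header (e.g. place.h) and
    // the definition in its own .cc file, so each domain builds as a separate
    // translation unit. BindFoo and core_demo are placeholder names.
    #include <pybind11/pybind11.h>

    namespace py = pybind11;

    void BindFoo(py::module &m);  // NOLINT

    void BindFoo(py::module &m) {  // NOLINT
      m.def("foo", []() { return 42; });
    }

    PYBIND11_MODULE(core_demo, m) {
      // The aggregator (analogous to pybind.cc) just calls each Bind*
      // function; order only matters when one binding refers to another's
      // types.
      BindFoo(m);
    }

Keeping each `Bind*` definition in its own .cc file is what lets the place, parallel-executor, and tensor bindings compile as independent objects instead of one monolithic pybind.cc.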
+ +#ifdef PADDLE_WITH_IPU +#include "paddle/fluid/platform/device/ipu/ipu_backend.h" +#include "paddle/fluid/platform/device/ipu/ipu_info.h" +#endif + +#ifdef PADDLE_WITH_MLU +#include "paddle/fluid/platform/device/mlu/mlu_info.h" +#endif + +#ifdef PADDLE_WITH_CRYPTO +#include "paddle/fluid/pybind/crypto.h" +#endif + +#if defined PADDLE_WITH_PSCORE +#include "paddle/fluid/pybind/fleet_py.h" +#endif + +#ifdef PADDLE_WITH_CINN +#include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h" +#endif + +#include "paddle/fluid/eager/api/utils/global_utils.h" +#include "paddle/fluid/imperative/layout_autotune.h" +#include "paddle/fluid/pybind/eager_utils.h" +#include "paddle/fluid/pybind/place.h" +#include "paddle/phi/api/ext/op_meta_info.h" +#include "paddle/phi/kernels/autotune/cache.h" +#include "paddle/phi/kernels/autotune/switch_autotune.h" +#include "pybind11/stl.h" + +DECLARE_bool(use_mkldnn); + +// disable auto conversion to list in Python +PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray); +PYBIND11_MAKE_OPAQUE(paddle::framework::FetchUnmergedList); +PYBIND11_MAKE_OPAQUE(paddle::framework::FetchList); +PYBIND11_MAKE_OPAQUE(paddle::framework::FetchType); + +namespace paddle { +namespace pybind { +PyTypeObject *g_place_pytype = nullptr; +PyTypeObject *g_customplace_pytype = nullptr; +PyTypeObject *g_cudaplace_pytype = nullptr; +PyTypeObject *g_cpuplace_pytype = nullptr; +PyTypeObject *g_xpuplace_pytype = nullptr; +PyTypeObject *g_npuplace_pytype = nullptr; +PyTypeObject *g_cudapinnedplace_pytype = nullptr; +PyTypeObject *g_mluplace_pytype = nullptr; + +template +static inline int PlaceIndex(const PlaceType &p) { // NOLINT + return static_cast(paddle::platform::Place(p).GetType()); +} + +template +static inline bool IsSamePlace(const PlaceType1 &p1, const PlaceType2 &p2) { + return paddle::platform::Place(p1) == paddle::platform::Place(p2); +} + +void BindPlace(pybind11::module &m) { // NOLINT + using namespace paddle::framework; // NOLINT + py::class_ customplace(m, + "CustomPlace", + R"DOC( + CustomPlace is a descriptor of a device. + It represents a custom device on which a tensor will be allocated and a model will run. + + Examples: + .. 
code-block:: python + + import paddle + fake_cpu_place = paddle.CustomPlace("FakeCPU", 0) + )DOC"); + g_customplace_pytype = reinterpret_cast(customplace.ptr()); + customplace + .def("__init__", + [](platform::CustomPlace &self, + const std::string &device_type, + int dev_id) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + if (UNLIKELY(dev_id < 0)) { + LOG(ERROR) << string::Sprintf( + "Invalid CustomPlace(%s, %d), device id must be 0 " + "or " + "positive integer", + device_type, + dev_id); + std::exit(-1); + } + + if (LIKELY(phi::DeviceManager::HasDeviceType(device_type) && + phi::DeviceManager::IsCustom(device_type))) { + int dev_count = static_cast( + phi::DeviceManager::GetDeviceCount(device_type)); + if (UNLIKELY(dev_id >= dev_count)) { + if (dev_count == 0) { + LOG(ERROR) << "Cannot use " << device_type + << " because there is no " << device_type + << " detected on your " + "machine."; + std::exit(-1); + } else { + LOG(ERROR) << string::Sprintf( + "Invalid CustomPlace(%s, %d), dev_id must " + "inside " + "[0, %d), because %s " + "number on your machine is %d", + device_type, + dev_id, + dev_count, + device_type, + dev_count); + std::exit(-1); + } + } + new (&self) platform::CustomPlace(device_type, dev_id); + } else { + LOG(ERROR) << string::Sprintf( + "Invalid CustomPlace(%s, %d), the device type is " + "not registered " + "as a custom device.", + device_type, + dev_id); + std::exit(-1); + } +#else + LOG(ERROR) << string::Sprintf( + "Cannot use CustomDevice because you have installed CPU/GPU" + "version PaddlePaddle.\n" + "If you want to use CustomDevice, please try to install" + "CustomDevice version " + "PaddlePaddle by: pip install paddlepaddle\n" + "If you only have CPU, please change " + "CustomPlace(%s, %d) to be CPUPlace().\n", + device_type, dev_id); + std::exit(-1); +#endif + }) + .def("_type", &PlaceIndex) + .def("get_device_id", + [](const platform::CustomPlace &self) { return self.GetDeviceId(); }) + .def("get_device_type", + [](const platform::CustomPlace &self) { + return self.GetDeviceType(); + }) + .def("__repr__", string::to_string) + .def("__str__", string::to_string); + py::class_ cudaplace(m, "CUDAPlace", R"DOC( + + CUDAPlace is a descriptor of a device. + It represents a GPU device allocated or to be allocated with Tensor or LoDTensor. + Each CUDAPlace has a dev_id to indicate the graphics card ID represented by the current CUDAPlace, + staring from 0. + The memory of CUDAPlace with different dev_id is not accessible. + Numbering here refers to the logical ID of the visible graphics card, not the actual ID of the graphics card. + You can set visible GPU devices by setting the `CUDA_VISIBLE_DEVICES` environment variable. + When the program starts, visible GPU devices will be numbered from 0. + If `CUDA_VISIBLE_DEVICES` is not set, all devices are visible by default, + and the logical ID is the same as the actual ID. + + Parameters: + id (int): GPU device ID. + + Examples: + .. 
code-block:: python + + import paddle + + place = paddle.CUDAPlace(0) + + )DOC"); + g_cudaplace_pytype = reinterpret_cast(cudaplace.ptr()); + cudaplace + .def("__init__", + [](platform::CUDAPlace &self, int dev_id) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (UNLIKELY(dev_id < 0)) { + LOG(ERROR) << string::Sprintf( + "Invalid CUDAPlace(%d), device id must be 0 or " + "positive integer", + dev_id); + std::exit(-1); + } + + if (UNLIKELY(dev_id >= platform::GetGPUDeviceCount())) { + if (platform::GetGPUDeviceCount() == 0) { + LOG(ERROR) << "Cannot use GPU because there is no GPU " + "detected on your " + "machine."; + std::exit(-1); + } else { + LOG(ERROR) << string::Sprintf( + "Invalid CUDAPlace(%d), must inside [0, %d), because GPU " + "number on your machine is %d", + dev_id, + platform::GetGPUDeviceCount(), + platform::GetGPUDeviceCount()); + std::exit(-1); + } + } + + new (&self) platform::CUDAPlace(dev_id); +#else + LOG(ERROR) << string::Sprintf( + "Cannot use GPU because you have installed CPU version " + "PaddlePaddle.\n" + "If you want to use GPU, please try to install GPU version " + "PaddlePaddle by: pip install paddlepaddle-gpu\n" + "If you only have CPU, please change CUDAPlace(%d) to be " + "CPUPlace().\n", + dev_id); + std::exit(-1); +#endif + }) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + .def("get_device_id", + [](const platform::CUDAPlace &self) { return self.GetDeviceId(); }) + .def("_type", &PlaceIndex) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", + &IsSamePlace) + .def("_get_device_id", + [](platform::CUDAPlace &self) -> int { return self.GetDeviceId(); }) +#endif + .def("__repr__", string::to_string) + .def("__str__", string::to_string); + + py::class_ xpuplace(m, "XPUPlace", R"DOC( + **Note**: + Examples: + .. 
code-block:: python + import paddle.fluid as fluid + xpu_place = fluid.XPUPlace(0) + )DOC"); + g_xpuplace_pytype = reinterpret_cast(xpuplace.ptr()); + xpuplace + .def("__init__", + [](platform::XPUPlace &self, int dev_id) { +#ifdef PADDLE_WITH_XPU + if (UNLIKELY(dev_id < 0)) { + LOG(ERROR) << string::Sprintf( + "Invalid XPUPlace(%d), device id must be 0 or " + "positive integer", + dev_id); + std::exit(-1); + } + if (UNLIKELY(dev_id >= platform::GetXPUDeviceCount())) { + if (platform::GetXPUDeviceCount() == 0) { + LOG(ERROR) << "Cannot use XPU because there is no XPU " + "detected on your " + "machine."; + std::exit(-1); + } else { + LOG(ERROR) << string::Sprintf( + "Invalid XPUPlace(%d), must inside [0, %d), because XPU " + "number on your machine is %d", + dev_id, + platform::GetXPUDeviceCount(), + platform::GetXPUDeviceCount()); + std::exit(-1); + } + } + new (&self) platform::XPUPlace(dev_id); +#else + LOG(ERROR) << string::Sprintf( + "Cannot use XPU because you have installed CPU/GPU version " + "PaddlePaddle.\n" + "If you want to use XPU, please try to install XPU version " + "PaddlePaddle by: pip install paddlepaddle-xpu\n" + "If you only have CPU, please change XPUPlace(%d) to be " + "CPUPlace().\n", + dev_id); + std::exit(-1); +#endif + }) +#ifdef PADDLE_WITH_XPU + .def("_type", &PlaceIndex) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", + &IsSamePlace) + .def("get_device_id", + [](const platform::XPUPlace &self) { return self.GetDeviceId(); }) +#endif + .def("__repr__", string::to_string) + .def("__str__", string::to_string); +#ifdef PADDLE_WITH_XPU + py::enum_(m, "XPUVersion", py::arithmetic()) + .value("XPU1", phi::backends::xpu::XPUVersion::XPU1) + .value("XPU2", phi::backends::xpu::XPUVersion::XPU2) + .export_values(); + m.def("get_xpu_device_count", platform::GetXPUDeviceCount); + m.def("get_xpu_device_version", + [](int device_id) { return platform::get_xpu_version(device_id); }); +#ifdef PADDLE_WITH_XPU_KP + m.def("get_xpu_device_op_support_types", + [](const std::string &op_name, phi::backends::xpu::XPUVersion version) { + return platform::get_xpu_kp_op_support_type(op_name, version); + }); +#else + m.def("get_xpu_device_op_support_types", + [](const std::string &op_name, phi::backends::xpu::XPUVersion version) { + return platform::get_xpu_op_support_type(op_name, version); + }); +#endif + m.def("get_xpu_device_op_list", [](phi::backends::xpu::XPUVersion version) { + return platform::get_xpu_op_list(version); + }); + m.def("is_float16_supported", [](const platform::XPUPlace &place) -> bool { + // XPUs with Compute Capability > xpu2 support float16 and bfloat16 + return platform::get_xpu_version(place.device) > + phi::backends::xpu::XPUVersion::XPU1; + }); + m.def("is_bfloat16_supported", [](const platform::XPUPlace &place) -> bool { + // XPUs with Compute Capability > xpu2 support float16 and bfloat16 + return platform::get_xpu_version(place.device) > + phi::backends::xpu::XPUVersion::XPU1; + }); +#endif + + py::class_ cpuplace(m, "CPUPlace", R"DOC( + CPUPlace is a descriptor of a device. + It represents a CPU device on which a tensor will be allocated and a model will run. + + Examples: + .. 
code-block:: python + + import paddle + cpu_place = paddle.CPUPlace() + + )DOC"); + g_cpuplace_pytype = reinterpret_cast(cpuplace.ptr()); + cpuplace.def(py::init<>()) + .def("_type", &PlaceIndex) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", + &IsSamePlace) + .def("__repr__", string::to_string) + .def("__str__", string::to_string); + + py::class_ cudapinnedplace( + m, "CUDAPinnedPlace", R"DOC( + CUDAPinnedPlace is a descriptor of a device. + It refers to the page locked memory allocated by the CUDA function `cudaHostAlloc()` in the host memory. + The host operating system will not paging and exchanging the memory. + It can be accessed through direct memory access technology to speed up the copy of data between the host and GPU. + For more information on CUDA data transfer and `pinned memory`, + please refer to `official document `_ . + + Examples: + .. code-block:: python + + import paddle + place = paddle.CUDAPinnedPlace() + + )DOC"); + g_cudapinnedplace_pytype = + reinterpret_cast(cudapinnedplace.ptr()); + cudapinnedplace + .def("__init__", + [](platform::CUDAPinnedPlace &self) { +#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) + PADDLE_THROW(platform::errors::PermissionDenied( + "Cannot use CUDAPinnedPlace in CPU only version, " + "Please recompile or reinstall Paddle with CUDA support.")); +#endif + new (&self) platform::CUDAPinnedPlace(); + }) + .def("_type", &PlaceIndex) + .def("_equals", &IsSamePlace) + .def("_equals", + &IsSamePlace) + .def("_equals", + &IsSamePlace) + .def("_equals", + &IsSamePlace) + .def("_equals", + &IsSamePlace) + .def("_equals", + &IsSamePlace) + .def("__repr__", string::to_string) + .def("__str__", string::to_string); + + // NPUPlace + py::class_ npuplace(m, "NPUPlace", R"DOC( + NPUPlace is a descriptor of a device. + It represents a NPU device on which a tensor will be allocated and a model will run. + + Examples: + .. 
code-block:: python + import paddle + npu_place = paddle.NPUPlace(0) + + )DOC"); + g_npuplace_pytype = reinterpret_cast(npuplace.ptr()); + npuplace + .def("__init__", + [](platform::NPUPlace &self, int dev_id) { +#ifdef PADDLE_WITH_ASCEND_CL + if (UNLIKELY(dev_id < 0)) { + LOG(ERROR) << string::Sprintf( + "Invalid NPUPlace(%d), device id must be 0 or " + "positive integer", + dev_id); + std::exit(-1); + } + if (UNLIKELY(dev_id >= platform::GetNPUDeviceCount())) { + if (platform::GetNPUDeviceCount() == 0) { + LOG(ERROR) << "Cannot use NPU because there is no NPU " + "detected on your " + "machine."; + std::exit(-1); + } else { + LOG(ERROR) << string::Sprintf( + "Invalid NPUPlace(%d), must inside [0, %d), because NPU " + "number on your machine is %d", + dev_id, + platform::GetNPUDeviceCount(), + platform::GetNPUDeviceCount()); + std::exit(-1); + } + } + new (&self) platform::NPUPlace(dev_id); +#else + LOG(ERROR) << string::Sprintf( + "Cannot use NPU because you have installed CPU/GPU version " + "PaddlePaddle.\n" + "If you want to use NPU, please try to install NPU version " + "PaddlePaddle by: pip install paddlepaddle-npu\n" + "If you only have CPU, please change NPUPlace(%d) to be " + "CPUPlace().\n", + dev_id); + std::exit(-1); +#endif + }) + .def("_type", &PlaceIndex) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", + &IsSamePlace) + .def("get_device_id", + [](const platform::NPUPlace &self) { return self.GetDeviceId(); }) + .def("__str__", string::to_string); + + // IPUPlace + py::class_(m, "IPUPlace", R"DOC( + IPUPlace is a descriptor of a device. + It represents a IPU device on which a tensor will be allocated and a model will run. + + Examples: + .. code-block:: python + import paddle + + # required: ipu + + ipu_place = paddle.IPUPlace() + + )DOC") + .def("__init__", + [](platform::IPUPlace &self) { +#ifdef PADDLE_WITH_IPU + if (platform::GetIPUDeviceCount() == 0) { + LOG(ERROR) << "Cannot use IPU because there is no IPU " + "detected on your " + "machine."; + std::exit(-1); + } + // use ipu(0) to comile, while run with the number user configure + // in sharding and pipline. + new (&self) platform::IPUPlace(0); +#else + LOG(ERROR) << string::Sprintf( + "Cannot use IPU because you didn't install IPU version " + "PaddlePaddle.\n" + "If you want to use IPU, please try to install IPU version " + "PaddlePaddle by: pip install paddlepaddle*\n" + "If you only have CPU, please change IPUPlace to be " + "CPUPlace().\n"); + std::exit(-1); +#endif + }) + .def("_type", &PlaceIndex) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", + &IsSamePlace) +#ifdef PADDLE_WITH_IPU + .def("get_device_id", + [](const platform::IPUPlace &self) { return self.GetDeviceId(); }) +#endif + .def("__str__", string::to_string); + + // MLUPlace + py::class_ mluplace(m, "MLUPlace", R"DOC( + MLUPlace is a descriptor of a device. + It represents a MLU device on which a tensor will be allocated and a model will run. + + Examples: + .. 
code-block:: python + import paddle + # required: mlu + mlu_place = paddle.MLUPlace(0) + + )DOC"); + g_mluplace_pytype = reinterpret_cast(mluplace.ptr()); + mluplace + .def("__init__", + [](platform::MLUPlace &self, int dev_id) { +#ifdef PADDLE_WITH_MLU + if (UNLIKELY(dev_id < 0)) { + LOG(ERROR) << string::Sprintf( + "Invalid MLUPlace(%d), device id must be 0 or " + "positive integer", + dev_id); + std::exit(-1); + } + if (UNLIKELY(dev_id >= platform::GetMLUDeviceCount())) { + if (platform::GetMLUDeviceCount() == 0) { + LOG(ERROR) << "Cannot use MLU because there is no MLU " + "detected on your " + "machine."; + std::exit(-1); + } else { + LOG(ERROR) << string::Sprintf( + "Invalid MLUPlace(%d), must inside [0, %d), because MLU " + "number on your machine is %d", + dev_id, + platform::GetMLUDeviceCount(), + platform::GetMLUDeviceCount()); + std::exit(-1); + } + } + new (&self) platform::MLUPlace(dev_id); +#else + LOG(ERROR) << string::Sprintf( + "Cannot use MLU because you have installed CPU/GPU/... " + "version " + "PaddlePaddle.\n" + "If you want to use MLU, please try to install MLU version " + "PaddlePaddle by: pip install paddlepaddle-mlu\n" + "If you only have CPU, please change MLUPlace(%d) to be " + "CPUPlace().\n", + dev_id); + std::exit(-1); +#endif + }) + .def("_type", &PlaceIndex) +#ifdef PADDLE_WITH_MLU + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", + &IsSamePlace) + .def("get_device_id", + [](const platform::MLUPlace &self) { return self.GetDeviceId(); }) +#endif + .def("__str__", string::to_string); + + py::class_ platformplace(m, "Place"); + g_place_pytype = reinterpret_cast(platformplace.ptr()); + platformplace.def(py::init<>()) + .def("_type", &PlaceIndex) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("is_gpu_place", + [](platform::Place &self) { return platform::is_gpu_place(self); }) + .def("is_cpu_place", + [](platform::Place &self) { return platform::is_cpu_place(self); }) + .def("is_xpu_place", + [](platform::Place &self) { return platform::is_xpu_place(self); }) + .def("is_npu_place", + [](platform::Place &self) { return platform::is_npu_place(self); }) + .def("is_ipu_place", + [](platform::Place &self) { return platform::is_ipu_place(self); }) + .def("is_cuda_pinned_place", + [](platform::Place &self) { + return platform::is_cuda_pinned_place(self); + }) + .def("is_mlu_place", + [](platform::Place &self) { return platform::is_mlu_place(self); }) + .def( + "is_custom_place", + [](platform::Place &self) { return platform::is_custom_place(self); }) + .def("gpu_device_id", [](platform::Place &self) { return self.device; }) + .def("xpu_device_id", [](platform::Place &self) { return self.device; }) + .def("npu_device_id", [](platform::Place &self) { return self.device; }) + .def("ipu_device_id", [](platform::Place &self) { return self.device; }) + .def("mlu_device_id", [](platform::Place &self) { return self.device; }) + .def("custom_device_id", + [](platform::Place &self) { return self.device; }) + .def("set_place", + [](platform::Place &self, const platform::Place &other) { + self = other; + }) + .def("set_place", + [](platform::Place 
&self, const platform::CPUPlace &cpu_place) { + self = cpu_place; + }) + .def("set_place", + [](platform::Place &self, const platform::XPUPlace &xpu_place) { + self = xpu_place; + }) + .def("set_place", + [](platform::Place &self, const platform::CUDAPlace &gpu_place) { + self = gpu_place; + }) + .def("set_place", + [](platform::Place &self, + const platform::CUDAPinnedPlace &cuda_pinned_place) { + self = cuda_pinned_place; + }) + .def("set_place", + [](platform::Place &self, const platform::NPUPlace &npu_place) { + self = npu_place; + }) + .def("set_place", + [](platform::Place &self, const platform::IPUPlace &ipu_place) { + self = ipu_place; + }) + .def("set_place", + [](platform::Place &self, const platform::MLUPlace &mlu_place) { + self = mlu_place; + }) + .def("set_place", + [](platform::Place &self, const platform::CustomPlace &plug_place) { + self = plug_place; + }) + .def("__repr__", string::to_string) + .def("__str__", string::to_string); +} + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/place.h b/paddle/fluid/pybind/place.h new file mode 100644 index 0000000000..40fb8d4c7f --- /dev/null +++ b/paddle/fluid/pybind/place.h @@ -0,0 +1,25 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "pybind11/pybind11.h" + +namespace paddle { +namespace pybind { + +void BindPlace(pybind11::module& m); // NOLINT + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 62f0402bed..40a03248cd 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -122,9 +122,12 @@ limitations under the License. 
*/ #include "paddle/fluid/pybind/nccl_wrapper_py.h" #endif #include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/pybind/parallel_executor.h" +#include "paddle/fluid/pybind/place.h" #include "paddle/fluid/pybind/protobuf.h" #include "paddle/fluid/pybind/pybind.h" // NOLINT #include "paddle/fluid/pybind/reader_py.h" +#include "paddle/fluid/pybind/tensor.h" #include "paddle/fluid/pybind/tensor_py.h" #include "paddle/fluid/string/to_string.h" #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -194,16 +197,7 @@ PYBIND11_MAKE_OPAQUE(paddle::framework::FetchType); namespace paddle { namespace pybind { -PyTypeObject *g_place_pytype = nullptr; PyTypeObject *g_framework_scope_pytype = nullptr; -PyTypeObject *g_cudaplace_pytype = nullptr; -PyTypeObject *g_cpuplace_pytype = nullptr; -PyTypeObject *g_xpuplace_pytype = nullptr; -PyTypeObject *g_npuplace_pytype = nullptr; -PyTypeObject *g_cudapinnedplace_pytype = nullptr; -PyTypeObject *g_mluplace_pytype = nullptr; -PyTypeObject *g_customplace_pytype = nullptr; -PyTypeObject *g_framework_tensor_pytype = nullptr; PyTypeObject *g_framework_lodtensorarray_pytype = nullptr; PyTypeObject *g_custom_op_kernel_ctx_pytype = nullptr; @@ -349,16 +343,6 @@ bool IsCompiledWithDIST() { #endif } -template -static inline bool IsSamePlace(const PlaceType1 &p1, const PlaceType2 &p2) { - return paddle::platform::Place(p1) == paddle::platform::Place(p2); -} - -template -static inline int PlaceIndex(const PlaceType &p) { - return static_cast(paddle::platform::Place(p).GetType()); -} - static PyObject *GetPythonAttribute(PyObject *obj, const char *attr_name) { // NOTE(zjl): PyObject_GetAttrString would return nullptr when attr_name // is not inside obj, but it would also set the error flag of Python. @@ -541,19 +525,6 @@ static int GetNCCLVersion() { } #endif -template -static void TensorCopyFrom(framework::Tensor *dst, - const framework::Tensor &src, - const PlaceType &place, - int64_t batch_size) { - if (batch_size < 0) { - framework::TensorCopy(src, place, dst); - } else { - auto sliced = src.Slice(0, batch_size); - framework::TensorCopy(sliced, place, dst); - } -} - #ifdef PADDLE_WITH_AVX PYBIND11_MODULE(core_avx, m) { #else @@ -854,897 +825,6 @@ PYBIND11_MODULE(core_noavx, m) { self.EmplaceBackAttr(attr); }); - py::class_ framework_tensor( - m, "Tensor", py::buffer_protocol()); - g_framework_tensor_pytype = - reinterpret_cast(framework_tensor.ptr()); - framework_tensor - .def("__array__", - [](framework::Tensor &self) { return TensorToPyArray(self); }) - .def("_ptr", - [](const framework::Tensor &self) { - return reinterpret_cast(self.data()); - }) - .def("_slice", &framework::Tensor::Slice) - .def("_numel", &framework::Tensor::numel) - .def("_is_initialized", - [](const framework::Tensor &self) { return self.IsInitialized(); }) - .def("_get_dims", - [](const framework::Tensor &self) { return vectorize(self.dims()); }) - .def("_set_dims", - [](framework::Tensor &self, const std::vector &dim) { - self.Resize(phi::make_ddim(dim)); - }) - .def("_set_layout", - [](framework::Tensor &self, const std::string &layout) { - self.set_layout(StringToDataLayout(layout)); - }) - .def("_alloc_float", - [](framework::Tensor &self, paddle::platform::CustomPlace &place) { - self.mutable_data(place); - }) - .def("_alloc_float", - [](framework::Tensor &self, paddle::platform::CUDAPlace &place) { - self.mutable_data(place); - }) - .def("_alloc_float", - [](framework::Tensor &self, paddle::platform::XPUPlace &place) { - self.mutable_data(place); - }) - 
.def("_alloc_float", - [](framework::Tensor &self, paddle::platform::CPUPlace &place) { - self.mutable_data(place); - }) - .def("_alloc_float", - [](framework::Tensor &self, paddle::platform::NPUPlace &place) { - self.mutable_data(place); - }) - .def("_alloc_float", - [](framework::Tensor &self, paddle::platform::MLUPlace &place) { - self.mutable_data(place); - }) - .def("_alloc_double", - [](framework::Tensor &self, paddle::platform::CPUPlace &place) { - self.mutable_data(place); - }) - .def("_alloc_int", - [](framework::Tensor &self, paddle::platform::CPUPlace &place) { - self.mutable_data(place); - }) - .def("_alloc_int", - [](framework::Tensor &self, paddle::platform::CustomPlace &place) { - self.mutable_data(place); - }) - .def("_alloc_int", - [](framework::Tensor &self, paddle::platform::XPUPlace &place) { - self.mutable_data(place); - }) - .def("_alloc_int", - [](framework::Tensor &self, paddle::platform::CUDAPlace &place) { - self.mutable_data(place); - }) - .def("_alloc_int", - [](framework::Tensor &self, paddle::platform::MLUPlace &place) { - self.mutable_data(place); - }) - .def("_alloc_int", - [](framework::Tensor &self, - paddle::platform::CUDAPinnedPlace &place) { - self.mutable_data(place); - }) - .def("_alloc_float", - [](framework::Tensor &self, - paddle::platform::CUDAPinnedPlace &place) { - self.mutable_data(place); - }) - .def("_mutable_data", - [](framework::Tensor &self, - paddle::platform::CPUPlace &place, - paddle::framework::proto::VarType::Type type) { - return reinterpret_cast( - self.mutable_data(place, framework::TransToPhiDataType(type))); - }) - .def("_mutable_data", - [](framework::Tensor &self, - paddle::platform::CustomPlace &place, - paddle::framework::proto::VarType::Type type) { - return reinterpret_cast( - self.mutable_data(place, framework::TransToPhiDataType(type))); - }) - .def("_mutable_data", - [](framework::Tensor &self, - paddle::platform::XPUPlace &place, - paddle::framework::proto::VarType::Type type) { - return reinterpret_cast( - self.mutable_data(place, framework::TransToPhiDataType(type))); - }) - .def("_mutable_data", - [](framework::Tensor &self, - paddle::platform::CUDAPlace &place, - paddle::framework::proto::VarType::Type type) { - return reinterpret_cast( - self.mutable_data(place, framework::TransToPhiDataType(type))); - }) - .def("_mutable_data", - [](framework::Tensor &self, - paddle::platform::CUDAPinnedPlace &place, - paddle::framework::proto::VarType::Type type) { - return reinterpret_cast( - self.mutable_data(place, framework::TransToPhiDataType(type))); - }) - .def("_mutable_data", - [](framework::Tensor &self, - paddle::platform::MLUPlace &place, - paddle::framework::proto::VarType::Type type) { - return reinterpret_cast( - self.mutable_data(place, framework::TransToPhiDataType(type))); - }) - .def("_clear", &framework::Tensor::clear) - .def("_mutable_data", - [](framework::Tensor &self, - paddle::platform::NPUPlace &place, - paddle::framework::proto::VarType::Type type) { - return reinterpret_cast( - self.mutable_data(place, framework::TransToPhiDataType(type))); - }) - .def("_copy_from", - &TensorCopyFrom, - py::arg("tensor"), - py::arg("place"), - py::arg("batch_size") = -1) - .def("_copy_from", - &TensorCopyFrom, - py::arg("tensor"), - py::arg("place"), - py::arg("batch_size") = -1) - .def("_copy_from", - &TensorCopyFrom, - py::arg("tensor"), - py::arg("place"), - py::arg("batch_size") = -1) - .def("_copy_from", - &TensorCopyFrom, - py::arg("tensor"), - py::arg("place"), - py::arg("batch_size") = -1) - .def("_copy_from", 
- &TensorCopyFrom, - py::arg("tensor"), - py::arg("place"), - py::arg("batch_size") = -1) - .def("_copy_from", - &TensorCopyFrom, - py::arg("tensor"), - py::arg("place"), - py::arg("batch_size") = -1) - .def("_copy_from", - &TensorCopyFrom, - py::arg("tensor"), - py::arg("place"), - py::arg("batch_size") = -1) - .def("_copy_from", - &TensorCopyFrom, - py::arg("tensor"), - py::arg("place"), - py::arg("batch_size") = -1) - .def("set", - SetTensorFromPyArray, - py::arg("array"), - py::arg("place"), - py::arg("zero_copy") = false) - .def("set", - SetTensorFromPyArray, - py::arg("array"), - py::arg("place"), - py::arg("zero_copy") = false) - .def("set", - SetTensorFromPyArray, - py::arg("array"), - py::arg("place"), - py::arg("zero_copy") = false) - .def("set", - SetTensorFromPyArray, - py::arg("array"), - py::arg("place"), - py::arg("zero_copy") = false) - .def("set", - SetTensorFromPyArray, - py::arg("array"), - py::arg("place"), - py::arg("zero_copy") = false) - .def("set", - SetTensorFromPyArray, - py::arg("array"), - py::arg("place"), - py::arg("zero_copy") = false) - .def("set", - SetTensorFromPyArray, - py::arg("array"), - py::arg("place"), - py::arg("zero_copy") = false) - .def("set", - SetTensorFromPyArray, - py::arg("array"), - py::arg("place"), - py::arg("zero_copy") = false, - R"DOC( - Set the data of Tensor on place with given numpy array. - - Args: - lod (numpy.ndarray): The data to set. - place (CPUPlace|CUDAPlace|XPUPlace|IPUPlace|CUDAPinnedPlace|NPUPlace|MLUPlace): The place where the - Tensor is to be set. - zero_copy (bool, optional): Whether to share memory with the input numpy array. - This parameter only works with CPUPlace. Default: False. - - Returns: - None. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - import numpy as np - - t = fluid.Tensor() - t.set(np.ndarray([5, 30]), fluid.CPUPlace()) - )DOC") - - .def( - "shape", - [](framework::Tensor &self) { return vectorize(self.dims()); }, - R"DOC( - Return the shape of Tensor. - - Returns: - list[int]: The shape of Tensor. - - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - import numpy as np - - t = fluid.Tensor() - t.set(np.ndarray([5, 30]), fluid.CPUPlace()) - print(t.shape()) # [5, 30] - )DOC") - .def("_to_dlpack", - [](framework::Tensor &self) { - DLPackTensor dlpack_tensor(self, 1); - DLManagedTensor *dmt = dlpack_tensor.ToDLManagedTensor(); - auto capsule = py::capsule( - static_cast(dmt), "dltensor", [](PyObject *ptr) { - if (ptr) { - auto dltensor = new DLManagedTensor; - try { - dltensor = reinterpret_cast( - PyCapsule_GetPointer(ptr, "used_dltensor")); - return; - } catch (...) 
{ - dltensor = reinterpret_cast( - PyCapsule_GetPointer(ptr, "dltensor")); - } - dltensor->deleter(dltensor); - } - }); - return capsule; - }) - .def("_set_float_element", TensorSetElement) - .def("_get_float_element", TensorGetElement) - .def("_set_double_element", TensorSetElement) - .def("_get_double_element", TensorGetElement) - .def("_place", [](framework::Tensor &self) { return self.place(); }) - .def("_dtype", - [](framework::Tensor &self) { - return framework::TransToProtoVarType(self.type()); - }) - .def("_layout", - [](framework::Tensor &self) { - return DataLayoutToString(self.layout()); - }) - .def("_share_data_with", &framework::Tensor::ShareDataWith) - .def("__getitem__", PySliceTensor, py::return_value_policy::reference) - .def("__str__", - [](const framework::Tensor &self) { - std::stringstream ostr; - ostr << self; - return ostr.str(); - }) /* ------ End of original Tensor ------ */ - .def("__init__", - [](framework::Tensor &instance, - const std::vector> - &recursive_sequence_lengths) { - LoD new_lod; - new_lod.reserve(recursive_sequence_lengths.size()); - std::copy(recursive_sequence_lengths.begin(), - recursive_sequence_lengths.end(), - std::back_inserter(new_lod)); - LoD new_offset_lod = ConvertToOffsetBasedLoD(new_lod); - PADDLE_ENFORCE_EQ( - CheckLoD(new_offset_lod, -1), - true, - platform::errors::InvalidArgument( - "The provided recursive_sequence_lengths info is " - "invalid, " - "the LoD converted by recursive_sequence_lengths is %s", - new_lod)); - new (&instance) framework::Tensor(new_offset_lod); - }) - .def("__init__", - [](framework::Tensor &instance) { - new (&instance) framework::Tensor(); - }) - // We implement offset based LOD in C++ while we use length based with - // Python API. So we changed set_lod to set_recursive_sequence_lengths - // to - // avoid misuse. - // The discussion is here: - // https://github.com/PaddlePaddle/Paddle/issues/10855 - .def( - "set_lod", - [](framework::Tensor &self, - const std::vector> &lod) { - // the input lod is offset-based level-of-detail info - LoD new_lod; - new_lod.reserve(lod.size()); - std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod)); - PADDLE_ENFORCE_EQ( - CheckLoD(new_lod, vectorize(self.dims()).front()), - true, - platform::errors::InvalidArgument( - "The provided LoD is invalid, the LoD is %s", new_lod)); - self.set_lod(new_lod); - }, - py::arg("lod"), - R"DOC( - Set LoD of the Tensor. - - Args: - lod (list[list[int]]): The lod to set. - - Returns: - None. - - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - import numpy as np - - t = fluid.Tensor() - t.set(np.ndarray([5, 30]), fluid.CPUPlace()) - t.set_lod([[0, 2, 5]]) - print(t.lod()) # [[0, 2, 5]] - )DOC") - .def( - "set_recursive_sequence_lengths", - [](framework::Tensor &self, - const std::vector> - &recursive_sequence_lengths) { - // the input recursive_sequence_lengths is length-based - // level-of-detail info - LoD new_lod; - new_lod.reserve(recursive_sequence_lengths.size()); - std::copy(recursive_sequence_lengths.begin(), - recursive_sequence_lengths.end(), - std::back_inserter(new_lod)); - LoD new_offset_lod = ConvertToOffsetBasedLoD(new_lod); - PADDLE_ENFORCE_EQ( - CheckLoD(new_offset_lod, vectorize(self.dims()).front()), - true, - platform::errors::InvalidArgument( - "The provided recursive_sequence_lengths info is " - "invalid, " - "the LoD converted by recursive_sequence_lengths is " - "%s", - new_lod)); - self.set_lod(new_offset_lod); - }, - py::arg("recursive_sequence_lengths"), - R"DOC( - Set LoD of the Tensor according to recursive sequence lengths. - - For example, if recursive_sequence_lengths=[[2, 3]], which means - there are two sequences with length 2 and 3 respectively, the - corresponding lod would be [[0, 2, 2+3]], i.e., [[0, 2, 5]]. - - Args: - recursive_sequence_lengths (list[list[int]]): The recursive sequence lengths. - - Returns: - None. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - import numpy as np - - t = fluid.Tensor() - t.set(np.ndarray([5, 30]), fluid.CPUPlace()) - t.set_recursive_sequence_lengths([[2, 3]]) - print(t.recursive_sequence_lengths()) # [[2, 3]] - print(t.lod()) # [[0, 2, 5]] - )DOC") - .def( - "lod", - [](framework::Tensor &self) -> std::vector> { - // output the offset-based lod info - LoD lod = self.lod(); - std::vector> new_lod; - new_lod.reserve(lod.size()); - std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod)); - return new_lod; - }, - R"DOC( - Return the LoD of the Tensor. - - Returns: - list[list[int]]: The lod of the Tensor. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - import numpy as np - - t = fluid.Tensor() - t.set(np.ndarray([5, 30]), fluid.CPUPlace()) - t.set_lod([[0, 2, 5]]) - print(t.lod()) # [[0, 2, 5]] - )DOC") - // Set above comments of set_lod. - .def( - "recursive_sequence_lengths", - [](framework::Tensor &self) -> std::vector> { - // output the length-based lod info - LoD lod = phi::ConvertToLengthBasedLoD(self.lod()); - std::vector> new_lod; - new_lod.reserve(lod.size()); - std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod)); - return new_lod; - }, - R"DOC( - Return the recursive sequence lengths corresponding to of the LodD - of the Tensor. - - Returns: - list[list[int]]: The recursive sequence lengths. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - import numpy as np - - t = fluid.Tensor() - t.set(np.ndarray([5, 30]), fluid.CPUPlace()) - t.set_recursive_sequence_lengths([[2, 3]]) - print(t.recursive_sequence_lengths()) # [[2, 3]] - )DOC") - .def( - "has_valid_recursive_sequence_lengths", - [](framework::Tensor &self) -> bool { - // Check that the lod info is valid and match the outermost - // dimension of the Tensor data - return CheckLoD(self.lod(), vectorize(self.dims()).front()); - }, - R"DOC( - Check whether the LoD of the Tensor is valid. - - Returns: - bool: Whether the LoD is valid. - - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - import numpy as np - - t = fluid.Tensor() - t.set(np.ndarray([5, 30]), fluid.CPUPlace()) - t.set_recursive_sequence_lengths([[2, 3]]) - print(t.has_valid_recursive_sequence_lengths()) # True - )DOC") - .def("_as_type", - [](const framework::Tensor &self, - paddle::framework::proto::VarType::Type type) { - framework::Tensor dst; - if (self.IsInitialized() && self.numel() > 0) { - TransDataType(self, type, &dst); - } - return dst; - }) - .def("_copy", - [](const framework::Tensor &self, const platform::Place &place) { - // follow fetch_op's inplementation - framework::Tensor dst; - if (self.IsInitialized() && self.numel() > 0) { - TensorCopySync(self, place, &dst); - } else { - // Not copy, if the src tensor is empty. - dst.clear(); - dst.Resize({0}); - } - dst.set_lod(self.lod()); - return dst; -#ifdef _WIN32 - }); -#else - }) -#ifdef PADDLE_WITH_CUDA - .def("_share_buffer_with", - [](framework::Tensor &self, const framework::Tensor src, - py::tuple t) { - auto *cuda_ipc_allocation = - dynamic_cast( - src.Holder().get()); - - PADDLE_ENFORCE_NOT_NULL( - cuda_ipc_allocation, - platform::errors::PreconditionNotMet( - "Tensor is not Cuda IPC shared tensor. " - "Now only Tensor shared by cuda ipc could use this " - "api.")); - - size_t size = t[0].cast(); - auto dtype = - static_cast(t[1].cast()); - auto dims = phi::make_ddim(t[2].cast>()); - auto lod_info = t[3].cast(); - auto device_id = t[4].cast(); - - auto shared_reader_holder = - std::make_shared( - cuda_ipc_allocation->ptr(), - cuda_ipc_allocation->base_ptr(), size, - platform::CUDAPlace(device_id)); - - self.ResetHolderWithType(shared_reader_holder, dtype); - self.Resize(dims); - self.set_lod(lod_info); - - VLOG(6) << "Reconstructed tensor with buffer shared!"; - }, - R"DOC( - Deserialize GPU Tensor for existed shared Cuda IPC tensor. - - Params: - tensor: Shared Cuda IPC tensor. - tuple: contrains data size, data type, - tensor dims, lod information, device index. - - )DOC") - .def("_share_cuda", - [](framework::Tensor self) { - if (!self.IsInitialized() || self.numel() == 0) - throw std::runtime_error( - "Tensor not initialized or numel is 0. could not pass " - "to shared memory. "); - - auto *holder = dynamic_cast( - self.Holder().get()); - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(holder->place()), true, - platform::errors::InvalidArgument( - "Tensor is not on GPU. share_cuda only support GPU " - "Tensor, share_filename is for CPU tensor.")); - - void *base_ptr = holder->base_ptr(); - ptrdiff_t offset_bytes = reinterpret_cast(holder->ptr()) - - reinterpret_cast(base_ptr); - - cudaIpcMemHandle_t handle; - PADDLE_ENFORCE_GPU_SUCCESS(cudaIpcGetMemHandle(&handle, base_ptr)); - - auto _handle = py::bytes(reinterpret_cast(&handle), - (py::ssize_t)CUDA_IPC_HANDLE_SIZE); - - // TODO(ZHUI): use cuda event, to avoid sync. - const auto &device_id = paddle::platform::GetCurrentDeviceId(); - auto stream = - paddle::platform::stream::get_current_stream(device_id); - stream->Synchronize(); - - int type_idx = static_cast(self.type()); - size_t data_size = - self.numel() * - framework::SizeOfType( - framework::TransToProtoVarType(self.type())); - - return py::make_tuple(_handle, (py::size_t)offset_bytes, data_size, - type_idx, vectorize(self.dims()), self.lod(), - device_id); - }, - R"DOC( - Serialize GPU Tensor by cudaIpcMemHandle. - - Returns: - tuple: contrains handle, data size, data type, - tensor dims, lod information, device index. - - Examples: - .. 
code-block:: python - - import paddle - tensor = paddle.ones([3,3]) - metainfo = tensor.value().get_tensor()._share_cuda() - - )DOC") - .def("_new_shared_cuda", - [](py::tuple t) { - if (t.size() != 7) - throw std::runtime_error( - "Invalid Tensor meta info for shared cuda tensor!"); - - // 1. Create a new C++ instance - framework::Tensor tensor; - - // 2. Rebuild Allocation from handle - const std::string &handle = t[0].cast(); - ptrdiff_t offset_bytes = (ptrdiff_t)t[1].cast(); - auto device_id = t[6].cast(); - auto base_ptr = memory::allocation::GetIpcBasePtr(handle); - size_t size = t[2].cast(); - void *dev = base_ptr.get(); - dev = reinterpret_cast(dev) + offset_bytes; - - auto shared_reader_holder = - std::make_shared( - dev, size, device_id, std::move(base_ptr)); - - // 3. Rebuild Tensor - tensor.ResetHolderWithType( - shared_reader_holder, - static_cast(t[3].cast())); - tensor.Resize(phi::make_ddim(t[4].cast>())); - tensor.set_lod(t[5].cast()); - - return tensor; - }, - R"DOC( - Deserialize GPU lod tensor from cudaIpcMemHandle. - - Params: - tuple: contrains handle, data size, data type, - tensor dims, lod information, device index. - - Examples: - .. code-block:: python - - import paddle - tensor = paddle.ones([3,3]) - metainfo = tensor.value().get_tensor()._share_cuda() - tensor_from_shared = paddle.to_tensor(paddle.fluid.core.LoDTensor._new_shared_cuda(metainfo)) - - )DOC") -#endif - .def("_share_filename", - [](framework::Tensor &self) { - if (!self.IsInitialized() || self.numel() == 0) - throw std::runtime_error( - "Tensor not initialized or numel is 0. could not pass to " - "shared memory. "); - - auto holder = self.Holder(); - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(holder->place()) || - platform::is_cuda_pinned_place(holder->place()), - true, platform::errors::InvalidArgument( - "Tensor is not on CPU. share_filename only " - "support CPU Tensor.")); - - auto *mmap_allocation = dynamic_cast< - memory::allocation::RefcountedMemoryMapAllocation *>( - holder.get()); - // If the tensor is not shared, allocate memory map allocation. - if (mmap_allocation == nullptr) { - void *data_ptr = self.data(); - size_t data_size = - self.numel() * - framework::SizeOfType( - framework::TransToProtoVarType(self.type())); - - int flags = memory::allocation::MAPPED_SHAREDMEM | - memory::allocation::MAPPED_EXCLUSIVE; - std::string handle = memory::allocation::GetIPCName(); - auto shared_holder = - memory::allocation::AllocateRefcountedMemoryMapAllocation( - handle, flags, data_size); - - // copy data & reset holder - if (platform::is_cuda_pinned_place(holder->place())) { -#ifdef PADDLE_WITH_CUDA - memory::Copy(platform::CPUPlace(), shared_holder->ptr(), - platform::CUDAPinnedPlace(), data_ptr, data_size); -#endif - } else { - memory::Copy(platform::CPUPlace(), shared_holder->ptr(), - platform::CPUPlace(), data_ptr, data_size); - } - self.ResetHolder(shared_holder); - mmap_allocation = shared_holder.get(); - } - int type_idx = static_cast(self.type()); - - return py::make_tuple(mmap_allocation->ipc_name(), - mmap_allocation->size(), type_idx, - vectorize(self.dims()), self.lod()); - }, - R"DOC( - Serialize CPU lod tensor in shared memory to tuple. - If the tensor is not in shared memory, we will copy it first. - - Returns: - tuple: contrains ipc name, data size, data type, - tensor dims and lod imformation. - - Examples: - .. 
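A hedged round-trip sketch for the CUDA IPC bindings above, assuming a GPU build so that paddle.ones allocates on the GPU (mirrors the docstring examples):

.. code-block:: python

    import paddle

    tensor = paddle.ones([3, 3])   # assumed to be a GPU tensor on a CUDA build
    metainfo = tensor.value().get_tensor()._share_cuda()
    rebuilt = paddle.to_tensor(
        paddle.fluid.core.LoDTensor._new_shared_cuda(metainfo))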
code-block:: python - - import paddle - tensor = paddle.ones([3,3]) - metainfo = tensor.value().get_tensor()._share_filename() - - )DOC") - .def("_new_shared_filename", - [](py::tuple t) { // __setstate__ - if (t.size() != 5) - throw std::runtime_error("Invalid Tensor meta info state!"); - - framework::Tensor tensor; - - // 2. Rebuild Allocation - const std::string &ipc_name = t[0].cast(); - size_t size = t[1].cast(); - int flags = memory::allocation::MAPPED_SHAREDMEM | - memory::allocation::MAPPED_NOCREATE; - - auto shared_holder = - memory::allocation::AllocateRefcountedMemoryMapAllocation( - ipc_name, flags, size); - - // 3. Rebuild Tensor - tensor.ResetHolderWithType( - shared_holder, - static_cast(t[2].cast())); - tensor.Resize(phi::make_ddim(t[3].cast>())); - tensor.set_lod(t[4].cast()); - - return tensor; - }, - R"DOC( - Deserialize CPU lod tensor from shared memory. - - Params: - tuple: contrains ipc file name, data size, data type, - tensor dims and lod information. - - Examples: - .. code-block:: python - - import paddle - tensor = paddle.ones([3,3]) - metainfo = tensor.value().get_tensor()._share_filename() - tensor_from_shared = paddle.to_tensor(paddle.fluid.core.LoDTensor._new_shared_filename(metainfo)) - - )DOC") - .def("_shared_incref", - [](framework::Tensor &self) { - auto *mmap_allocation = dynamic_cast< - memory::allocation::RefcountedMemoryMapAllocation *>( - self.Holder().get()); - if (mmap_allocation) { - mmap_allocation->incref(); - } - }, - R"DOC( - Increase reference count of share_filename tensor. - )DOC") - .def("_shared_decref", - [](framework::Tensor &self) { - auto *mmap_allocation = dynamic_cast< - memory::allocation::RefcountedMemoryMapAllocation *>( - self.Holder().get()); - if (mmap_allocation) { - mmap_allocation->decref(); - } - }, - R"DOC( - Decrease reference count of share_filename tensor. - )DOC") - .def(py::pickle( - [](const framework::Tensor &t) { // __getstate__ - auto holder = t.Holder(); - PADDLE_ENFORCE_EQ(platform::is_cpu_place(holder->place()), true, - platform::errors::PreconditionNotMet( - "Tensor is not on CPU." - "Now only Tensor on CPU can be serialized.")); - auto *mmap_writer_allocation = - dynamic_cast( - holder.get()); - PADDLE_ENFORCE_NOT_NULL( - mmap_writer_allocation, - platform::errors::PreconditionNotMet( - "Tensor is not in shared memory." - "Now only Tensor on shared memory can be serialized.")); - int type_idx = static_cast(t.type()); - - return py::make_tuple(mmap_writer_allocation->ipc_name(), - mmap_writer_allocation->size(), type_idx, - vectorize(t.dims()), t.lod()); - }, - [](py::tuple t) { // __setstate__ - if (t.size() != 5) - throw std::runtime_error("Invalid Tensor state!"); - - // 1. Create a new C++ instance - framework::Tensor tensor; - - // 2. Rebuild Allocation - const std::string &ipc_name = t[0].cast(); - size_t size = t[1].cast(); - auto shared_reader_holder = - memory::allocation::RebuildMemoryMapReaderAllocation(ipc_name, - size); - - // 3. Maintain global fd set - VLOG(3) << "Tensor ipc name: " << ipc_name; - memory::allocation::MemoryMapFdSet::Instance().Insert(ipc_name); - - // 4. 
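The analogous CPU path serializes through a shared-memory file; a sketch based on the docstring examples, assuming paddle.ones returns a CPU tensor (e.g. a CPU-only build). The _shared_incref/_shared_decref bindings below manage the refcount of that mapping:

.. code-block:: python

    import paddle

    tensor = paddle.ones([3, 3])   # assumed CPU tensor; copied into shared memory on first share
    metainfo = tensor.value().get_tensor()._share_filename()
    rebuilt = paddle.to_tensor(
        paddle.fluid.core.LoDTensor._new_shared_filename(metainfo))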
Rebuild Tensor - tensor.ResetHolderWithType( - shared_reader_holder, - static_cast(t[2].cast())); - tensor.Resize(phi::make_ddim(t[3].cast>())); - tensor.set_lod(t[4].cast()); - - return tensor; - })); -#endif - - py::class_(m, "SelectedRows") - .def("__init__", - [](phi::SelectedRows &instance) { - new (&instance) phi::SelectedRows(); - }) - .def("__init__", - [](phi::SelectedRows &instance, - const std::vector rows, - const int64_t &height) { - new (&instance) phi::SelectedRows(rows, height); - }) - .def( - "get_tensor", - [](phi::SelectedRows &self) { return self.mutable_value(); }, - py::return_value_policy::reference) - .def("numel", - [](phi::SelectedRows &self) -> int64_t { - return self.value().numel(); - }) - .def("set_height", &phi::SelectedRows::set_height) - .def("height", &phi::SelectedRows::height) - .def("set_rows", - [](phi::SelectedRows &self, std::vector rows) { -#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) - self.set_rows(rows); -#else - Vector new_rows(rows); - self.set_rows(new_rows); -#endif - }) - .def("sync_index", - [](phi::SelectedRows &instance) { instance.SyncIndex(); }) - .def("rows", [](phi::SelectedRows &self) { - auto rows = self.rows(); - std::vector new_rows; - new_rows.reserve(rows.size()); - std::copy(rows.begin(), rows.end(), std::back_inserter(new_rows)); - return new_rows; - }); - py::class_(m, "Variable", R"DOC(Variable Class. All parameter, weight, gradient are variables in Paddle. @@ -2272,603 +1352,6 @@ All parameter, weight, gradient are variables in Paddle. #endif return devices; }); - py::class_ customplace(m, - "CustomPlace", - R"DOC( - CustomPlace is a descriptor of a device. - It represents a custom device on which a tensor will be allocated and a model will run. - - Examples: - .. code-block:: python - - import paddle - fake_cpu_place = paddle.CustomPlace("FakeCPU", 0) - )DOC"); - g_customplace_pytype = reinterpret_cast(customplace.ptr()); - customplace - .def("__init__", - [](platform::CustomPlace &self, - const std::string &device_type, - int dev_id) { -#ifdef PADDLE_WITH_CUSTOM_DEVICE - if (UNLIKELY(dev_id < 0)) { - LOG(ERROR) << string::Sprintf( - "Invalid CustomPlace(%s, %d), device id must be 0 " - "or " - "positive integer", - device_type, - dev_id); - std::exit(-1); - } - - if (LIKELY(phi::DeviceManager::HasDeviceType(device_type) && - phi::DeviceManager::IsCustom(device_type))) { - int dev_count = static_cast( - phi::DeviceManager::GetDeviceCount(device_type)); - if (UNLIKELY(dev_id >= dev_count)) { - if (dev_count == 0) { - LOG(ERROR) << "Cannot use " << device_type - << " because there is no " << device_type - << " detected on your " - "machine."; - std::exit(-1); - } else { - LOG(ERROR) << string::Sprintf( - "Invalid CustomPlace(%s, %d), dev_id must " - "inside " - "[0, %d), because %s " - "number on your machine is %d", - device_type, - dev_id, - dev_count, - device_type, - dev_count); - std::exit(-1); - } - } - new (&self) platform::CustomPlace(device_type, dev_id); - } else { - LOG(ERROR) << string::Sprintf( - "Invalid CustomPlace(%s, %d), the device type is " - "not registered " - "as a custom device.", - device_type, - dev_id); - std::exit(-1); - } -#else - LOG(ERROR) << string::Sprintf( - "Cannot use CustomDevice because you have installed CPU/GPU" - "version PaddlePaddle.\n" - "If you want to use CustomDevice, please try to install" - "CustomDevice version " - "PaddlePaddle by: pip install paddlepaddle\n" - "If you only have CPU, please change " - "CustomPlace(%s, %d) to be CPUPlace().\n", - device_type, 
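A minimal sketch of the SelectedRows bindings above, assuming the class is exposed as paddle.fluid.core.SelectedRows (rows are int64 indices into a dense value tensor of the given height):

.. code-block:: python

    import paddle.fluid as fluid
    import numpy as np

    sr = fluid.core.SelectedRows([0, 4, 7], 10)   # rows, height
    value = sr.get_tensor()                        # dense values of the selected rows
    value.set(np.random.rand(3, 5).astype('float32'), fluid.CPUPlace())
    print(sr.rows())     # [0, 4, 7]
    print(sr.height())   # 10
    print(sr.numel())    # 15, numel of the value tensor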
dev_id); - std::exit(-1); -#endif - }) - .def("_type", &PlaceIndex) - .def("get_device_id", - [](const platform::CustomPlace &self) { return self.GetDeviceId(); }) - .def("get_device_type", - [](const platform::CustomPlace &self) { - return self.GetDeviceType(); - }) - .def("__repr__", string::to_string) - .def("__str__", string::to_string); - py::class_ cudaplace(m, "CUDAPlace", R"DOC( - - CUDAPlace is a descriptor of a device. - It represents a GPU device allocated or to be allocated with Tensor or LoDTensor. - Each CUDAPlace has a dev_id to indicate the graphics card ID represented by the current CUDAPlace, - staring from 0. - The memory of CUDAPlace with different dev_id is not accessible. - Numbering here refers to the logical ID of the visible graphics card, not the actual ID of the graphics card. - You can set visible GPU devices by setting the `CUDA_VISIBLE_DEVICES` environment variable. - When the program starts, visible GPU devices will be numbered from 0. - If `CUDA_VISIBLE_DEVICES` is not set, all devices are visible by default, - and the logical ID is the same as the actual ID. - - Parameters: - id (int): GPU device ID. - - Examples: - .. code-block:: python - - import paddle - - place = paddle.CUDAPlace(0) - - )DOC"); - g_cudaplace_pytype = reinterpret_cast(cudaplace.ptr()); - cudaplace - .def("__init__", - [](platform::CUDAPlace &self, int dev_id) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (UNLIKELY(dev_id < 0)) { - LOG(ERROR) << string::Sprintf( - "Invalid CUDAPlace(%d), device id must be 0 or " - "positive integer", - dev_id); - std::exit(-1); - } - - if (UNLIKELY(dev_id >= platform::GetGPUDeviceCount())) { - if (platform::GetGPUDeviceCount() == 0) { - LOG(ERROR) << "Cannot use GPU because there is no GPU " - "detected on your " - "machine."; - std::exit(-1); - } else { - LOG(ERROR) << string::Sprintf( - "Invalid CUDAPlace(%d), must inside [0, %d), because GPU " - "number on your machine is %d", - dev_id, - platform::GetGPUDeviceCount(), - platform::GetGPUDeviceCount()); - std::exit(-1); - } - } - - new (&self) platform::CUDAPlace(dev_id); -#else - LOG(ERROR) << string::Sprintf( - "Cannot use GPU because you have installed CPU version " - "PaddlePaddle.\n" - "If you want to use GPU, please try to install GPU version " - "PaddlePaddle by: pip install paddlepaddle-gpu\n" - "If you only have CPU, please change CUDAPlace(%d) to be " - "CPUPlace().\n", - dev_id); - std::exit(-1); -#endif - }) -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - .def("get_device_id", - [](const platform::CUDAPlace &self) { return self.GetDeviceId(); }) - .def("_type", &PlaceIndex) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", - &IsSamePlace) - .def("_get_device_id", - [](platform::CUDAPlace &self) -> int { return self.GetDeviceId(); }) -#endif - .def("__repr__", string::to_string) - .def("__str__", string::to_string); - - py::class_ xpuplace(m, "XPUPlace", R"DOC( - **Note**: - Examples: - .. 
code-block:: python - import paddle.fluid as fluid - xpu_place = fluid.XPUPlace(0) - )DOC"); - g_xpuplace_pytype = reinterpret_cast(xpuplace.ptr()); - xpuplace - .def("__init__", - [](platform::XPUPlace &self, int dev_id) { -#ifdef PADDLE_WITH_XPU - if (UNLIKELY(dev_id < 0)) { - LOG(ERROR) << string::Sprintf( - "Invalid XPUPlace(%d), device id must be 0 or " - "positive integer", - dev_id); - std::exit(-1); - } - if (UNLIKELY(dev_id >= platform::GetXPUDeviceCount())) { - if (platform::GetXPUDeviceCount() == 0) { - LOG(ERROR) << "Cannot use XPU because there is no XPU " - "detected on your " - "machine."; - std::exit(-1); - } else { - LOG(ERROR) << string::Sprintf( - "Invalid XPUPlace(%d), must inside [0, %d), because XPU " - "number on your machine is %d", - dev_id, - platform::GetXPUDeviceCount(), - platform::GetXPUDeviceCount()); - std::exit(-1); - } - } - new (&self) platform::XPUPlace(dev_id); -#else - LOG(ERROR) << string::Sprintf( - "Cannot use XPU because you have installed CPU/GPU version " - "PaddlePaddle.\n" - "If you want to use XPU, please try to install XPU version " - "PaddlePaddle by: pip install paddlepaddle-xpu\n" - "If you only have CPU, please change XPUPlace(%d) to be " - "CPUPlace().\n", - dev_id); - std::exit(-1); -#endif - }) -#ifdef PADDLE_WITH_XPU - .def("_type", &PlaceIndex) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", - &IsSamePlace) - .def("get_device_id", - [](const platform::XPUPlace &self) { return self.GetDeviceId(); }) -#endif - .def("__repr__", string::to_string) - .def("__str__", string::to_string); -#ifdef PADDLE_WITH_XPU - py::enum_(m, "XPUVersion", py::arithmetic()) - .value("XPU1", phi::backends::xpu::XPUVersion::XPU1) - .value("XPU2", phi::backends::xpu::XPUVersion::XPU2) - .export_values(); - m.def("get_xpu_device_count", platform::GetXPUDeviceCount); - m.def("get_xpu_device_version", - [](int device_id) { return platform::get_xpu_version(device_id); }); -#ifdef PADDLE_WITH_XPU_KP - m.def("get_xpu_device_op_support_types", - [](const std::string &op_name, phi::backends::xpu::XPUVersion version) { - return platform::get_xpu_kp_op_support_type(op_name, version); - }); -#else - m.def("get_xpu_device_op_support_types", - [](const std::string &op_name, phi::backends::xpu::XPUVersion version) { - return platform::get_xpu_op_support_type(op_name, version); - }); -#endif - m.def("get_xpu_device_op_list", [](phi::backends::xpu::XPUVersion version) { - return platform::get_xpu_op_list(version); - }); - m.def("is_float16_supported", [](const platform::XPUPlace &place) -> bool { - // XPUs with Compute Capability > xpu2 support float16 and bfloat16 - return platform::get_xpu_version(place.device) > - phi::backends::xpu::XPUVersion::XPU1; - }); - m.def("is_bfloat16_supported", [](const platform::XPUPlace &place) -> bool { - // XPUs with Compute Capability > xpu2 support float16 and bfloat16 - return platform::get_xpu_version(place.device) > - phi::backends::xpu::XPUVersion::XPU1; - }); -#endif - - py::class_ cpuplace(m, "CPUPlace", R"DOC( - CPUPlace is a descriptor of a device. - It represents a CPU device on which a tensor will be allocated and a model will run. - - Examples: - .. 
code-block:: python - - import paddle - cpu_place = paddle.CPUPlace() - - )DOC"); - g_cpuplace_pytype = reinterpret_cast(cpuplace.ptr()); - cpuplace.def(py::init<>()) - .def("_type", &PlaceIndex) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", - &IsSamePlace) - .def("__repr__", string::to_string) - .def("__str__", string::to_string); - - py::class_ cudapinnedplace( - m, "CUDAPinnedPlace", R"DOC( - CUDAPinnedPlace is a descriptor of a device. - It refers to the page locked memory allocated by the CUDA function `cudaHostAlloc()` in the host memory. - The host operating system will not paging and exchanging the memory. - It can be accessed through direct memory access technology to speed up the copy of data between the host and GPU. - For more information on CUDA data transfer and `pinned memory`, - please refer to `official document `_ . - - Examples: - .. code-block:: python - - import paddle - place = paddle.CUDAPinnedPlace() - - )DOC"); - g_cudapinnedplace_pytype = - reinterpret_cast(cudapinnedplace.ptr()); - cudapinnedplace - .def("__init__", - [](platform::CUDAPinnedPlace &self) { -#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) - PADDLE_THROW(platform::errors::PermissionDenied( - "Cannot use CUDAPinnedPlace in CPU only version, " - "Please recompile or reinstall Paddle with CUDA support.")); -#endif - new (&self) platform::CUDAPinnedPlace(); - }) - .def("_type", &PlaceIndex) - .def("_equals", &IsSamePlace) - .def("_equals", - &IsSamePlace) - .def("_equals", - &IsSamePlace) - .def("_equals", - &IsSamePlace) - .def("_equals", - &IsSamePlace) - .def("_equals", - &IsSamePlace) - .def("__repr__", string::to_string) - .def("__str__", string::to_string); - - // NPUPlace - py::class_ npuplace(m, "NPUPlace", R"DOC( - NPUPlace is a descriptor of a device. - It represents a NPU device on which a tensor will be allocated and a model will run. - - Examples: - .. 
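A combined sketch of the place constructors documented above (the CUDA places assume a GPU build; device ids are logical, following CUDA_VISIBLE_DEVICES):

.. code-block:: python

    import paddle

    cpu_place = paddle.CPUPlace()
    gpu_place = paddle.CUDAPlace(0)          # logical device 0, GPU build only
    pinned_place = paddle.CUDAPinnedPlace()  # page-locked host memory, GPU build only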
code-block:: python - import paddle - npu_place = paddle.NPUPlace(0) - - )DOC"); - g_npuplace_pytype = reinterpret_cast(npuplace.ptr()); - npuplace - .def("__init__", - [](platform::NPUPlace &self, int dev_id) { -#ifdef PADDLE_WITH_ASCEND_CL - if (UNLIKELY(dev_id < 0)) { - LOG(ERROR) << string::Sprintf( - "Invalid NPUPlace(%d), device id must be 0 or " - "positive integer", - dev_id); - std::exit(-1); - } - if (UNLIKELY(dev_id >= platform::GetNPUDeviceCount())) { - if (platform::GetNPUDeviceCount() == 0) { - LOG(ERROR) << "Cannot use NPU because there is no NPU " - "detected on your " - "machine."; - std::exit(-1); - } else { - LOG(ERROR) << string::Sprintf( - "Invalid NPUPlace(%d), must inside [0, %d), because NPU " - "number on your machine is %d", - dev_id, - platform::GetNPUDeviceCount(), - platform::GetNPUDeviceCount()); - std::exit(-1); - } - } - new (&self) platform::NPUPlace(dev_id); -#else - LOG(ERROR) << string::Sprintf( - "Cannot use NPU because you have installed CPU/GPU version " - "PaddlePaddle.\n" - "If you want to use NPU, please try to install NPU version " - "PaddlePaddle by: pip install paddlepaddle-npu\n" - "If you only have CPU, please change NPUPlace(%d) to be " - "CPUPlace().\n", - dev_id); - std::exit(-1); -#endif - }) - .def("_type", &PlaceIndex) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", - &IsSamePlace) - .def("get_device_id", - [](const platform::NPUPlace &self) { return self.GetDeviceId(); }) - .def("__str__", string::to_string); - - // IPUPlace - py::class_(m, "IPUPlace", R"DOC( - IPUPlace is a descriptor of a device. - It represents a IPU device on which a tensor will be allocated and a model will run. - - Examples: - .. code-block:: python - import paddle - - # required: ipu - - ipu_place = paddle.IPUPlace() - - )DOC") - .def("__init__", - [](platform::IPUPlace &self) { -#ifdef PADDLE_WITH_IPU - if (platform::GetIPUDeviceCount() == 0) { - LOG(ERROR) << "Cannot use IPU because there is no IPU " - "detected on your " - "machine."; - std::exit(-1); - } - // use ipu(0) to comile, while run with the number user configure - // in sharding and pipline. - new (&self) platform::IPUPlace(0); -#else - LOG(ERROR) << string::Sprintf( - "Cannot use IPU because you didn't install IPU version " - "PaddlePaddle.\n" - "If you want to use IPU, please try to install IPU version " - "PaddlePaddle by: pip install paddlepaddle*\n" - "If you only have CPU, please change IPUPlace to be " - "CPUPlace().\n"); - std::exit(-1); -#endif - }) - .def("_type", &PlaceIndex) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", - &IsSamePlace) -#ifdef PADDLE_WITH_IPU - .def("get_device_id", - [](const platform::IPUPlace &self) { return self.GetDeviceId(); }) -#endif - .def("__str__", string::to_string); - - // MLUPlace - py::class_ mluplace(m, "MLUPlace", R"DOC( - MLUPlace is a descriptor of a device. - It represents a MLU device on which a tensor will be allocated and a model will run. - - Examples: - .. 
code-block:: python - import paddle - # required: mlu - mlu_place = paddle.MLUPlace(0) - - )DOC"); - g_mluplace_pytype = reinterpret_cast(mluplace.ptr()); - mluplace - .def("__init__", - [](platform::MLUPlace &self, int dev_id) { -#ifdef PADDLE_WITH_MLU - if (UNLIKELY(dev_id < 0)) { - LOG(ERROR) << string::Sprintf( - "Invalid MLUPlace(%d), device id must be 0 or " - "positive integer", - dev_id); - std::exit(-1); - } - if (UNLIKELY(dev_id >= platform::GetMLUDeviceCount())) { - if (platform::GetMLUDeviceCount() == 0) { - LOG(ERROR) << "Cannot use MLU because there is no MLU " - "detected on your " - "machine."; - std::exit(-1); - } else { - LOG(ERROR) << string::Sprintf( - "Invalid MLUPlace(%d), must inside [0, %d), because MLU " - "number on your machine is %d", - dev_id, - platform::GetMLUDeviceCount(), - platform::GetMLUDeviceCount()); - std::exit(-1); - } - } - new (&self) platform::MLUPlace(dev_id); -#else - LOG(ERROR) << string::Sprintf( - "Cannot use MLU because you have installed CPU/GPU/... " - "version " - "PaddlePaddle.\n" - "If you want to use MLU, please try to install MLU version " - "PaddlePaddle by: pip install paddlepaddle-mlu\n" - "If you only have CPU, please change MLUPlace(%d) to be " - "CPUPlace().\n", - dev_id); - std::exit(-1); -#endif - }) - .def("_type", &PlaceIndex) -#ifdef PADDLE_WITH_MLU - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", - &IsSamePlace) - .def("get_device_id", - [](const platform::MLUPlace &self) { return self.GetDeviceId(); }) -#endif - .def("__str__", string::to_string); - - py::class_ platformplace(m, "Place"); - g_place_pytype = reinterpret_cast(platformplace.ptr()); - platformplace.def(py::init<>()) - .def("_type", &PlaceIndex) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("is_gpu_place", - [](platform::Place &self) { return platform::is_gpu_place(self); }) - .def("is_cpu_place", - [](platform::Place &self) { return platform::is_cpu_place(self); }) - .def("is_xpu_place", - [](platform::Place &self) { return platform::is_xpu_place(self); }) - .def("is_npu_place", - [](platform::Place &self) { return platform::is_npu_place(self); }) - .def("is_ipu_place", - [](platform::Place &self) { return platform::is_ipu_place(self); }) - .def("is_cuda_pinned_place", - [](platform::Place &self) { - return platform::is_cuda_pinned_place(self); - }) - .def("is_mlu_place", - [](platform::Place &self) { return platform::is_mlu_place(self); }) - .def( - "is_custom_place", - [](platform::Place &self) { return platform::is_custom_place(self); }) - .def("gpu_device_id", [](platform::Place &self) { return self.device; }) - .def("xpu_device_id", [](platform::Place &self) { return self.device; }) - .def("npu_device_id", [](platform::Place &self) { return self.device; }) - .def("ipu_device_id", [](platform::Place &self) { return self.device; }) - .def("mlu_device_id", [](platform::Place &self) { return self.device; }) - .def("custom_device_id", - [](platform::Place &self) { return self.device; }) - .def("set_place", - [](platform::Place &self, const platform::Place &other) { - self = other; - }) - .def("set_place", - [](platform::Place 
&self, const platform::CPUPlace &cpu_place) { - self = cpu_place; - }) - .def("set_place", - [](platform::Place &self, const platform::XPUPlace &xpu_place) { - self = xpu_place; - }) - .def("set_place", - [](platform::Place &self, const platform::CUDAPlace &gpu_place) { - self = gpu_place; - }) - .def("set_place", - [](platform::Place &self, - const platform::CUDAPinnedPlace &cuda_pinned_place) { - self = cuda_pinned_place; - }) - .def("set_place", - [](platform::Place &self, const platform::NPUPlace &npu_place) { - self = npu_place; - }) - .def("set_place", - [](platform::Place &self, const platform::IPUPlace &ipu_place) { - self = ipu_place; - }) - .def("set_place", - [](platform::Place &self, const platform::MLUPlace &mlu_place) { - self = mlu_place; - }) - .def("set_place", - [](platform::Place &self, const platform::CustomPlace &plug_place) { - self = plug_place; - }) - .def("__repr__", string::to_string) - .def("__str__", string::to_string); py::class_(m, "Operator") .def_static("create", @@ -3661,927 +2144,6 @@ All parameter, weight, gradient are variables in Paddle. m.def("clear_executor_cache", []() { framework::ExecutorInfoCache::Instance().Finalize(); }); - using VarQuantScale = - std::unordered_map>; - - py::class_> pass(m, "Pass"); - pass.def(py::init()) - .def("has", &ir::Pass::Has) - .def("set_not_owned", - [](ir::Pass &self, const std::string &attr_name, ProgramDesc &attr) { - self.SetNotOwned(attr_name, &attr); - }) - .def( - "set", - [](ir::Pass &self, const std::string &name, const std::string &attr) { - self.Set(name, new std::string(attr)); - }) - .def("set", - [](ir::Pass &self, const std::string &name, bool val) { - self.Set(name, new bool(val)); - }) - .def("set", - [](ir::Pass &self, const std::string &name, int val) { - self.Set(name, new int(val)); - }) - .def("set", - [](ir::Pass &self, - const std::string &name, - std::vector set) { - self.Set(name, new std::vector(set)); - }) - .def("set", - [](ir::Pass &self, - const std::string &name, - std::unordered_set set) { - self.Set(name, new std::unordered_set(set)); - }) - .def("set", - [](ir::Pass &self, - const std::string &name, - std::unordered_set set) { - self.Set(name, new std::unordered_set(set)); - }) - .def("set", - [](ir::Pass &self, const std::string &name, VarQuantScale scales) { - self.Set(name, new VarQuantScale(scales)); - }) - .def("type", &ir::Pass::Type) - .def("apply", [](ir::Pass &self, std::shared_ptr graph) { - self.Apply(graph.get()); - }); - - py::class_> pb( - m, "PassBuilder"); - pb.def(py::init()) - .def("append_pass", - [](ir::PassBuilder &self, - const std::string &pass_type) -> std::shared_ptr { - return self.AppendPass(pass_type); - }) - .def("all_passes", [](ir::PassBuilder &self) { return self.AllPasses(); }) - .def("insert_pass", - [](ir::PassBuilder &self, size_t idx, const std::string &pass_type) { - return self.InsertPass(idx, pass_type); - }) - .def("remove_pass", - [](ir::PassBuilder &self, size_t idx) { self.RemovePass(idx); }); - - // -- python binds for parallel executor. - py::class_ pe(m, "ParallelExecutor"); - py::class_ exec_strategy(pe, "ExecutionStrategy", R"DOC( - ExecutionStrategy allows the user to more preciously control how to run - the program in ParallelExecutor by setting the property. - - Returns: - ExecutionStrategy: An ExecutionStrategy object. - - Examples: - .. 
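The generic Place bound above mainly serves as a type-erased holder that can be reassigned via set_place and queried; a sketch assuming it is exposed as paddle.fluid.core.Place:

.. code-block:: python

    import paddle
    from paddle.fluid import core

    place = core.Place()
    place.set_place(paddle.CPUPlace())
    print(place.is_cpu_place())   # True
    print(place.is_gpu_place())   # False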
code-block:: python - - import paddle - import paddle.static as static - import paddle.nn.functional as F - - paddle.enable_static() - - x = static.data(name='x', shape=[None, 13], dtype='float32') - y = static.data(name='y', shape=[None, 1], dtype='float32') - y_predict = static.nn.fc(input=x, size=1, act=None) - - cost = F.square_error_cost(input=y_predict, label=y) - avg_loss = paddle.mean(cost) - - sgd_optimizer = paddle.optimizer.SGD(learning_rate=0.001) - sgd_optimizer.minimize(avg_loss) - - exec_strategy = static.ExecutionStrategy() - exec_strategy.num_threads = 4 - - train_exe = static.ParallelExecutor(use_cuda=False, - loss_name=avg_loss.name, - exec_strategy=exec_strategy) - )DOC"); - - py::enum_(m, "DeviceType", py::arithmetic()) - .value("CPU", paddle::platform::DeviceType::CPU) - .value("CUDA", paddle::platform::DeviceType::CUDA) - .value("XPU", paddle::platform::DeviceType::XPU); - - exec_strategy.def(py::init()) - .def_property( - "num_threads", - [](const ExecutionStrategy &self) { return self.num_threads_; }, - [](ExecutionStrategy &self, size_t num_threads) { - self.num_threads_ = num_threads; - }, - R"DOC( - The type is INT, num_threads represents the size of thread pool that - used to run the operators of the current program in ParallelExecutor. - If :math:`num\_threads=1`, all the operators will execute one by one, - but the order maybe difference between iterations. - If it is not set, it will be set in ParallelExecutor according to the - device type and device count, for GPU, :math:`num\_threads=device\_count*4`, for CPU, - :math:`num\_threads=CPU\_NUM*4`, the explanation of:math:`CPU\_NUM` is in ParallelExecutor. - if it is not set, ParallelExecutor will get the cpu count by calling - `multiprocessing.cpu_count()`. Default 0. - - Examples: - .. code-block:: python - - import paddle - import paddle.static as static - - paddle.enable_static() - - exec_strategy = static.ExecutionStrategy() - exec_strategy.num_threads = 4 - )DOC") - .def_property( - "_use_device", - [](const ExecutionStrategy &self) { return self.use_device_; }, - [](ExecutionStrategy &self, paddle::platform::DeviceType use_device) { - self.use_device_ = use_device; - }) // NOTE(liuyuhui): Doesn't add doc for 'use_device', because - // use_device isn‘t exposed to users. - .def_property( - "allow_op_delay", - [](const ExecutionStrategy &self) { return self.allow_op_delay_; }, - [](ExecutionStrategy &self, bool allow_op_delay) { - self.allow_op_delay_ = allow_op_delay; - }, - R"DOC(The type is BOOL, allow_op_delay represents whether to delay the - communication operators to run, it may make the execution faster. - Note that this option is invalid now, and it will be removed in - next version. Default False.)DOC") - .def_property( - "num_iteration_per_drop_scope", - [](const ExecutionStrategy &self) { - return self.num_iteration_per_drop_scope_; - }, - [](ExecutionStrategy &self, size_t num_iteration_per_drop_scope) { - self.num_iteration_per_drop_scope_ = num_iteration_per_drop_scope; - }, - R"DOC(The type is INT, num_iteration_per_drop_scope indicates how - many iterations to clean up the temp variables which - is generated during execution. It may make the execution faster, - because the temp variable's shape maybe the same between two iterations. - Default 100. - - .. note:: - 1. If you fetch data when calling the 'run', the ParallelExecutor - will clean up the temp variables at the end of the current iteration. - 2. 
In some NLP model, it may cause the GPU memory is insufficient, - in this case, you should reduce `num_iteration_per_drop_scope`. - - Examples: - .. code-block:: python - - import paddle - import paddle.static as static - - paddle.enable_static() - - exec_strategy = static.ExecutionStrategy() - exec_strategy.num_iteration_per_drop_scope = 10 - )DOC") - .def_property( - "num_iteration_per_run", - [](const ExecutionStrategy &self) { - return self.num_iteration_per_run_; - }, - [](ExecutionStrategy &self, size_t num_iteration_per_run) { - self.num_iteration_per_run_ = num_iteration_per_run; - }, - R"DOC(This config that how many iteration the executor will run when - user call exe.run() in python。Default: 1. - - Examples: - .. code-block:: python - - import paddle - import paddle.static as static - - paddle.enable_static() - - exec_strategy = static.ExecutionStrategy() - exec_strategy.num_iteration_per_run = 10 - )DOC") - .def_property( - "use_thread_barrier", - [](const ExecutionStrategy &self) { return self.thread_barrier_; }, - [](ExecutionStrategy &self, bool use_thread_barrier) { - self.thread_barrier_ = use_thread_barrier; - }, - R"DOC(This config that the this is distributed training with parameter server - )DOC") - .def_property( - "_dry_run", - [](const ExecutionStrategy &self) { return self.dry_run_; }, - [](ExecutionStrategy &self, bool dry_run) { - self.dry_run_ = dry_run; - }); - - exec_strategy.def_property( - "use_experimental_executor", - [](const ExecutionStrategy &self) { - return self.type_ == ExecutionStrategy::kExperimental; - }, - [](ExecutionStrategy &self, bool experimental) { - self.type_ = experimental ? ExecutionStrategy::kExperimental - : ExecutionStrategy::kDefault; - }); - - py::class_ build_strategy(pe, "BuildStrategy", R"DOC( - BuildStrategy allows the user to more preciously control how to - build the SSA Graph in ParallelExecutor by setting the property. - - Returns: - BuildStrategy: An BuildStrategy object. - - Examples: - .. 
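Putting the ExecutionStrategy knobs above together (values taken from the docstring examples):

.. code-block:: python

    import paddle
    import paddle.static as static

    paddle.enable_static()

    exec_strategy = static.ExecutionStrategy()
    exec_strategy.num_threads = 4                    # thread pool size for operator execution
    exec_strategy.num_iteration_per_drop_scope = 10  # clean temp scopes every 10 iterations
    exec_strategy.num_iteration_per_run = 1          # iterations executed per exe.run() call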
code-block:: python - - import os - import paddle - import paddle.static as static - - paddle.enable_static() - - os.environ['CPU_NUM'] = str(2) - places = static.cpu_places() - - data = static.data(name="x", shape=[None, 1], dtype="float32") - hidden = static.nn.fc(input=data, size=10) - loss = paddle.mean(hidden) - paddle.optimizer.SGD(learning_rate=0.01).minimize(loss) - - build_strategy = static.BuildStrategy() - build_strategy.enable_inplace = True - build_strategy.memory_optimize = True - build_strategy.reduce_strategy = static.BuildStrategy.ReduceStrategy.Reduce - program = static.CompiledProgram(static.default_main_program()) - program = program.with_data_parallel(loss_name=loss.name, - build_strategy=build_strategy, - places=places) -)DOC"); - - py::enum_(build_strategy, "ReduceStrategy") - .value("Reduce", BuildStrategy::ReduceStrategy::kReduce) - .value("AllReduce", BuildStrategy::ReduceStrategy::kAllReduce) - .value("_NoReduce", BuildStrategy::ReduceStrategy::kNoReduce); - py::enum_(build_strategy, - "GradientScaleStrategy") - .value("CoeffNumDevice", - BuildStrategy::GradientScaleStrategy::kCoeffNumDevice) - .value("One", BuildStrategy::GradientScaleStrategy::kOne) - .value("Customized", BuildStrategy::GradientScaleStrategy::kCustomized); - - build_strategy.def(py::init()) - .def("_clear_finalized", &BuildStrategy::ClearFinalized) - .def_property( - "reduce_strategy", - [](const BuildStrategy &self) { return self.reduce_; }, - [](BuildStrategy &self, BuildStrategy::ReduceStrategy strategy) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finlaized, cannot be " - "configured again.")); - self.reduce_ = strategy; - }, - R"DOC((fluid.BuildStrategy.ReduceStrategy, optional): there are two reduce - strategies in ParallelExecutor, AllReduce and Reduce. If you want - that all the parameters' optimization are done on all devices independently, - you should choose AllReduce; otherwise, if you choose Reduce, all the parameters' - optimization will be evenly distributed to different devices, and then - broadcast the optimized parameter to other devices. - Default is 'AllReduce'. - - Examples: - .. code-block:: python - - import paddle - import paddle.static as static - - paddle.enable_static() - - build_strategy = static.BuildStrategy() - build_strategy.reduce_strategy = static.BuildStrategy.ReduceStrategy.Reduce - )DOC") - .def_property( - "gradient_scale_strategy", - [](const BuildStrategy &self) { return self.gradient_scale_; }, - [](BuildStrategy &self, - BuildStrategy::GradientScaleStrategy strategy) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finlaized, cannot be " - "configured again.")); - self.gradient_scale_ = strategy; - }, - R"DOC((paddle.static.BuildStrategy.GradientScaleStrategy, optional): there are three - ways of defining :math:`loss@grad` in ParallelExecutor, that is, CoeffNumDevice, - One and Customized. By default, ParallelExecutor sets the :math:`loss@grad` - according to the number of devices. If you want to customize :math:`loss@grad`, - you can choose Customized. Default is 'CoeffNumDevice'. - - Examples: - .. 
code-block:: python - - import numpy - import os - import paddle - import paddle.static as static - - paddle.enable_static() - - use_cuda = True - place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() - exe = static.Executor(place) - - # NOTE: If you use CPU to run the program, you need - # to specify the CPU_NUM, otherwise, paddle will use - # all the number of the logic core as the CPU_NUM, - # in that case, the batch size of the input should be - # greater than CPU_NUM, if not, the process will be - # failed by an exception. - if not use_cuda: - os.environ['CPU_NUM'] = str(2) - places = static.cpu_places() - else: - places = static.cuda_places() - - data = static.data(name='X', shape=[None, 1], dtype='float32') - hidden = static.nn.fc(input=data, size=10) - loss = paddle.mean(hidden) - paddle.optimizer.SGD(learning_rate=0.01).minimize(loss) - - exe.run(static.default_startup_program()) - - build_strategy = static.BuildStrategy() - build_strategy.gradient_scale_strategy = \ - static.BuildStrategy.GradientScaleStrategy.Customized - compiled_prog = static.CompiledProgram( - static.default_main_program()).with_data_parallel( - loss_name=loss.name, build_strategy=build_strategy, - places=places) - - dev_count = len(places) - x = numpy.random.random(size=(10, 1)).astype('float32') - loss_grad = numpy.ones((dev_count)).astype("float32") * 0.01 - loss_grad_name = loss.name+"@GRAD" - loss_data = exe.run(compiled_prog, - feed={"X": x, loss_grad_name : loss_grad}, - fetch_list=[loss.name, loss_grad_name]) - )DOC") - .def_property( - "debug_graphviz_path", - [](const BuildStrategy &self) { return self.debug_graphviz_path_; }, - [](BuildStrategy &self, const std::string &path) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finlaized, cannot be " - "configured again.")); - self.debug_graphviz_path_ = path; - }, - R"DOC((str, optional): debug_graphviz_path indicates the path that - writing the SSA Graph to file in the form of graphviz. - It is useful for debugging. Default is empty string, that is, "" - - Examples: - .. code-block:: python - - import paddle - import paddle.static as static - - paddle.enable_static() - - build_strategy = static.BuildStrategy() - build_strategy.debug_graphviz_path = "./graph" - )DOC") - .def_property( - "enable_sequential_execution", - [](const BuildStrategy &self) { - return self.enable_sequential_execution_; - }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finlaized, cannot be " - "configured again.")); - self.enable_sequential_execution_ = b; - }, - R"DOC((bool, optional): If set True, the execution order of ops would - be the same as what is in the program. Default is False. - - Examples: - .. code-block:: python - - import paddle - import paddle.static as static - - paddle.enable_static() - - build_strategy = static.BuildStrategy() - build_strategy.enable_sequential_execution = True - )DOC") - .def_property( - "remove_unnecessary_lock", - [](const BuildStrategy &self) { - return self.remove_unnecessary_lock_; - }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finlaized, cannot be " - "configured again.")); - self.remove_unnecessary_lock_ = b; - }, - R"DOC((bool, optional): If set True, some locks in GPU ops would be - released and ParallelExecutor would run faster. Default is True. 
- - Examples: - .. code-block:: python - - import paddle - import paddle.static as static - - paddle.enable_static() - - build_strategy = static.BuildStrategy() - build_strategy.remove_unnecessary_lock = True - )DOC") - .def_property( - "num_trainers", - [](const BuildStrategy &self) { return self.num_trainers_; }, - [](BuildStrategy &self, int num_trainers) { -#ifdef WIN32 - PADDLE_THROW(platform::errors::Unavailable( - "Distribution mode is not supported on Windows platform.")); -#endif - self.num_trainers_ = num_trainers; - }) - .def_property( - "trainers_endpoints", - [](const BuildStrategy &self) { return self.trainers_endpoints_; }, - [](BuildStrategy &self, - const std::vector &trainers_endpoints) { - self.trainers_endpoints_ = trainers_endpoints; - }) - .def_property( - "trainer_id", - [](const BuildStrategy &self) { return self.trainer_id_; }, - [](BuildStrategy &self, int trainer_id) { - self.trainer_id_ = trainer_id; - }) - .def_property( - "nccl_comm_num", - [](const BuildStrategy &self) { return self.nccl_comm_num_; }, - [](BuildStrategy &self, int nccl_comm_num) { - self.nccl_comm_num_ = nccl_comm_num; - }) - .def_property( - "bkcl_comm_num", - [](const BuildStrategy &self) { return self.bkcl_comm_num_; }, - [](BuildStrategy &self, int bkcl_comm_num) { - self.bkcl_comm_num_ = bkcl_comm_num; - }) - .def_property( - "use_hierarchical_allreduce", - [](const BuildStrategy &self) { - return self.use_hierarchical_allreduce_; - }, - [](BuildStrategy &self, bool use) { - self.use_hierarchical_allreduce_ = use; - }) - .def_property( - "hierarchical_allreduce_inter_nranks", - [](const BuildStrategy &self) { - return self.hierarchical_allreduce_inter_nranks_; - }, - [](BuildStrategy &self, int nranks) { - self.hierarchical_allreduce_inter_nranks_ = nranks; - }) - - .def_property( - "fuse_elewise_add_act_ops", - [](const BuildStrategy &self) { - return self.fuse_elewise_add_act_ops_; - }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finlaized, cannot be " - "configured again.")); - self.fuse_elewise_add_act_ops_ = b; - }, - R"DOC((bool, optional): fuse_elewise_add_act_ops indicate whether - to fuse elementwise_add_op and activation_op, - it may make the execution faster. Default is False. - - Examples: - .. code-block:: python - - import paddle - import paddle.static as static - - paddle.enable_static() - - build_strategy = static.BuildStrategy() - build_strategy.fuse_elewise_add_act_ops = True - )DOC") - .def_property( - "fuse_gemm_epilogue", - [](const BuildStrategy &self) { return self.fuse_gemm_epilogue_; }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finlaized, cannot be " - "configured again.")); - self.fuse_gemm_epilogue_ = b; - }, - R"DOC((bool, optional): fuse_gemm_epilogue indicate whether - to fuse matmul_op, elemenewist_add_op and activation_op, - it may make the execution faster. Default is False. - - Examples: - .. 
code-block:: python - - import paddle - import paddle.static as static - - paddle.enable_static() - - build_strategy = static.BuildStrategy() - build_strategy.fuse_gemm_epilogue = True - )DOC") - .def_property( - "fuse_bn_act_ops", - [](const BuildStrategy &self) { return self.fuse_bn_act_ops_; }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finlaized, cannot be " - "configured again.")); - self.fuse_bn_act_ops_ = b; - }, - R"DOC((bool, optional): fuse_bn_act_ops indicate whether - to fuse batch_norm and activation_op, - it may make the execution faster. Default is False. - - Examples: - .. code-block:: python - - import paddle - import paddle.static as static - - paddle.enable_static() - - build_strategy = static.BuildStrategy() - build_strategy.fuse_bn_act_ops = True - )DOC") - .def_property( - "fuse_bn_add_act_ops", - [](const BuildStrategy &self) { return self.fuse_bn_add_act_ops_; }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finlaized, cannot be " - "configured again.")); - self.fuse_bn_add_act_ops_ = b; - }, - R"DOC((bool, optional): fuse_bn_add_act_ops indicate whether - to fuse batch_norm, elementwise_add and activation_op, - it may make the execution faster. Default is True - - Examples: - .. code-block:: python - - import paddle - import paddle.static as static - - paddle.enable_static() - - build_strategy = static.BuildStrategy() - build_strategy.fuse_bn_add_act_ops = True - )DOC") - .def_property( - "enable_auto_fusion", - [](const BuildStrategy &self) { return self.enable_auto_fusion_; }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finlaized, cannot be " - "configured again.")); - self.enable_auto_fusion_ = b; - }, - R"DOC((bool, optional): Whether to enable fusing subgraph to a - fusion_group. Now we only support fusing subgraph that composed - of elementwise-like operators, such as elementwise_add/mul - without broadcast and activations. - - Examples: - .. code-block:: python - - import paddle - import paddle.static as static - - paddle.enable_static() - - build_strategy = static.BuildStrategy() - build_strategy.enable_auto_fusion = True - )DOC") - .def_property( - "fuse_relu_depthwise_conv", - [](const BuildStrategy &self) { - return self.fuse_relu_depthwise_conv_; - }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finlaized, cannot be " - "configured again.")); - self.fuse_relu_depthwise_conv_ = b; - }, - R"DOC((bool, optional): fuse_relu_depthwise_conv indicate whether - to fuse relu and depthwise_conv2d, - it will save GPU memory and may make the execution faster. - This options is only available in GPU devices. - Default is False. - - Examples: - .. 
code-block:: python - - import paddle - import paddle.static as static - - paddle.enable_static() - - build_strategy = static.BuildStrategy() - build_strategy.fuse_relu_depthwise_conv = True - )DOC") - .def_property( - "fuse_broadcast_ops", - [](const BuildStrategy &self) { - return self.fuse_broadcast_ops_ == true || - self.fuse_broadcast_ops_ == paddle::none; - }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finlaized, " - "cannot be configured again.")); - self.fuse_broadcast_ops_ = b; - }, - R"DOC((bool, optional): fuse_broadcast_op indicates whether - to fuse the broadcast ops. Note that, in Reduce mode, - fusing broadcast ops may make the program faster. Because - fusing broadcast OP equals delaying the execution of all - broadcast Ops, in this case, all nccl streams are used only - for NCCLReduce operations for a period of time. Default False. - - Examples: - .. code-block:: python - - import paddle - import paddle.static as static - - paddle.enable_static() - - build_strategy = static.BuildStrategy() - build_strategy.fuse_broadcast_ops = True - )DOC") - .def_property( - "fuse_all_optimizer_ops", - [](const BuildStrategy &self) { - return self.fuse_all_optimizer_ops_ == true || - self.fuse_all_optimizer_ops_ == paddle::none; - }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finlaized, " - "cannot be configured again.")); - self.fuse_all_optimizer_ops_ = b; - }) - .def_property( - "sync_batch_norm", - [](const BuildStrategy &self) { return self.sync_batch_norm_; }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finlaized, cannot be " - "configured again.")); - self.sync_batch_norm_ = b; - }, - R"DOC((bool, optional): sync_batch_norm indicates whether to use - synchronous batch normalization which synchronizes the mean - and variance through multi-devices in training phase. - Current implementation doesn't support FP16 training and CPU. - And only synchronous on one machine, not all machines. - Default is False. - - Examples: - .. code-block:: python - - import paddle - import paddle.static as static - - paddle.enable_static() - - build_strategy = static.BuildStrategy() - build_strategy.sync_batch_norm = True - )DOC") - .def_property( - "memory_optimize", - [](const BuildStrategy &self) -> py::object { - if (self.memory_optimize_) { - return py::cast(self.memory_optimize_.get()); - } else { - return py::cast(nullptr); - } - }, - [](BuildStrategy &self, const py::handle &value) { - auto *py_obj = value.ptr(); - if (py_obj == nullptr || py_obj == Py_None) { - self.memory_optimize_ = paddle::none; - } else if (PyBool_Check(py_obj)) { - self.memory_optimize_ = (py_obj == Py_True); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "BuildStrategy.memory_optimize must be set to None, False " - "or True")); - } - }, - R"DOC((bool, optional): memory opitimize aims to save total memory - consumption, set to True to enable it. - - Default None. None means framework would choose to use or not use - this strategy automatically. Currently, None means that it is - enabled when GC is disabled, and disabled when GC is enabled. - True means enabling and False means disabling. Default is None. - - Examples: - .. 
code-block:: python - - import paddle - import paddle.static as static - - paddle.enable_static() - - build_strategy = static.BuildStrategy() - build_strategy.memory_optimize = True - - )DOC") - .def_property( - "is_distribution", - [](const BuildStrategy &self) { return self.is_distribution_; }, - [](BuildStrategy &self, bool b) { -#ifdef WIN32 - if (b) { - PADDLE_THROW(platform::errors::Unavailable( - "Distribution mode is not supported on Windows platform.")); - } -#else - self.is_distribution_ = b; -#endif - }) - .def_property( - "async_mode", - [](const BuildStrategy &self) { return self.async_mode_; }, - [](BuildStrategy &self, bool b) { self.async_mode_ = b; }) - .def_property( - "enable_inplace", - [](const BuildStrategy &self) { return self.enable_inplace_; }, - [](BuildStrategy &self, bool b) { self.enable_inplace_ = b; }) - .def_property( - "enable_addto", - [](const BuildStrategy &self) { return self.enable_addto_; }, - [](BuildStrategy &self, bool b) { self.enable_addto_ = b; }) - .def_property( - "fuse_all_reduce_ops", - [](const BuildStrategy &self) { - return self.fuse_all_reduce_ops_ == true || - self.fuse_all_reduce_ops_ == paddle::none; - }, - [](BuildStrategy &self, bool b) { self.fuse_all_reduce_ops_ = b; }) - .def_property( - "enable_backward_optimizer_op_deps", - [](const BuildStrategy &self) { - return self.enable_backward_optimizer_op_deps_; - }, - [](BuildStrategy &self, bool b) { - self.enable_backward_optimizer_op_deps_ = b; - }) - .def_property( - "cache_runtime_context", - [](const BuildStrategy &self) { return self.cache_runtime_context_; }, - [](BuildStrategy &self, bool b) { self.cache_runtime_context_ = b; }) - .def_property( - "mkldnn_enabled_op_types", - [](const BuildStrategy &self) { - return self.mkldnn_enabled_op_types_; - }, - [](BuildStrategy &self, - const std::unordered_set &mkldnn_enabled_op_types) { - self.mkldnn_enabled_op_types_ = mkldnn_enabled_op_types; - }) - .def_property( - "fix_op_run_order", - [](const BuildStrategy &self) { return self.fix_op_run_order_; }, - [](BuildStrategy &self, bool fix_op_run_order) { - self.fix_op_run_order_ = fix_op_run_order; - }) - .def_property( - "allow_cuda_graph_capture", - [](const BuildStrategy &self) { - return self.allow_cuda_graph_capture_; - }, - [](BuildStrategy &self, bool allow_cuda_graph_capture) { - self.allow_cuda_graph_capture_ = allow_cuda_graph_capture; - }) - .def("_copy", - [](const BuildStrategy &self) { - auto new_bs = self; - new_bs.ClearFinalized(); - return new_bs; - }) - .def( - "_finalize_strategy_and_create_passes", - [](BuildStrategy &self) -> std::shared_ptr { - return self.CreatePassesFromStrategy(true); - }, - R"DOC(Allow user to customized passes. Normally model-specific - optimization passes should be defined in this way. BuildStrategy - cannot be updated after being finalized.)DOC"); - - m.def("_set_cached_executor_build_strategy", - [](int64_t program_id, const BuildStrategy &build_strategy) { - auto &cached_exe_info = framework::ExecutorInfoCache::Instance(); - cached_exe_info.SetBuildStrategy(program_id, build_strategy); - }); - - pe.def(py::init &, - const std::vector &, - const std::string &, - Scope *, - std::vector &, - const ExecutionStrategy &, - const BuildStrategy &, - ir::Graph *>()) - // NOTE: even we return a vec* to Python use reference policy. - // We still cannot get local_scope from this vector, since the element - // of vec will be freed by Python GC. We can only return Scope* - // one by one and mark them as reference. 
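A consolidated BuildStrategy sketch combining several of the flags documented above (all values mirror the docstring examples; the strategy must be configured before it is finalized):

.. code-block:: python

    import paddle
    import paddle.static as static

    paddle.enable_static()

    build_strategy = static.BuildStrategy()
    build_strategy.reduce_strategy = static.BuildStrategy.ReduceStrategy.Reduce
    build_strategy.fuse_elewise_add_act_ops = True   # fuse elementwise_add + activation
    build_strategy.memory_optimize = True            # enable memory reuse
    build_strategy.enable_inplace = True             # enable inplace execution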
- .def( - "local_scopes", - [](ParallelExecutor &self) -> std::vector * { - return &self.GetLocalScopes(); - }, - py::return_value_policy::reference) - .def("drop_local_exe_scopes", &ParallelExecutor::DropLocalExeScopes) - .def("_need_create_local_exe_scopes", - &ParallelExecutor::NeedCreateLocalExeScope) - .def("feed_tensors_into_local_scopes", - &ParallelExecutor::FeedTensorsIntoLocalScopes) - .def("feed_and_split_tensor_into_local_scopes", - &ParallelExecutor::FeedAndSplitTensorIntoLocalScopes) - .def("run", - [](ParallelExecutor &self, - const std::vector &fetch_tensors, - bool return_merged) -> py::object { - if (return_merged) { - paddle::framework::FetchList ret; - /*gil_scoped_release*/ { - pybind11::gil_scoped_release release; - ret = self.RunAndMerge(fetch_tensors); - } - return py::cast(std::move(ret)); - } else { - paddle::framework::FetchUnmergedList ret; - /*gil_scoped_release*/ { - pybind11::gil_scoped_release release; - ret = self.Run(fetch_tensors); - } - return py::cast(std::move(ret)); - } - }) - .def("device_count", &ParallelExecutor::DeviceCount); - #ifdef PADDLE_WITH_IPU py::class_>( @@ -4790,6 +2352,9 @@ All parameter, weight, gradient are variables in Paddle. BindFleetWrapper(&m); BindIO(&m); + BindParallelExecutor(m); + BindPlace(m); + BindTensor(m); #if defined(PADDLE_WITH_PSLIB) && !defined(PADDLE_WITH_HETERPS) BindHeterWrapper(&m); diff --git a/paddle/fluid/pybind/tensor.cc b/paddle/fluid/pybind/tensor.cc new file mode 100644 index 0000000000..6ee72e0c16 --- /dev/null +++ b/paddle/fluid/pybind/tensor.cc @@ -0,0 +1,1106 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Copyright (c) 2022 NVIDIA Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include + +#include +#include +#include +#include +#include +#include +#include // NOLINT // for call_once +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/framework/custom_operator.h" +#include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/framework/data_type_transform.h" +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/executor_cache.h" +#include "paddle/fluid/framework/executor_gc_helper.h" +#include "paddle/fluid/framework/feed_fetch_method.h" +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/garbage_collector.h" +#include "paddle/fluid/framework/io/fs.h" +#include "paddle/fluid/framework/ir/coalesce_grad_tensor_pass.h" +#include "paddle/fluid/framework/ir/cost_model.h" +#include "paddle/fluid/framework/ir/generate_pass.h" +#include "paddle/fluid/framework/ir/pass_builder.h" +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/new_executor/executor_statistics.h" +#include "paddle/fluid/framework/new_executor/standalone_executor.h" +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/framework/parallel_executor.h" +#include "paddle/fluid/framework/phi_utils.h" +#include "paddle/fluid/framework/prune.h" +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/save_load_util.h" +#include "paddle/fluid/framework/scope_pool.h" +#include "paddle/fluid/framework/selected_rows_utils.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/trainer.h" +#include "paddle/fluid/framework/type_defs.h" +#include "paddle/fluid/framework/version.h" +#include "paddle/fluid/imperative/amp_auto_cast.h" +#include "paddle/fluid/imperative/layer.h" +#include "paddle/fluid/memory/allocation/allocator_strategy.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/memory/allocation/cuda_ipc_allocator.h" +#endif +#include "paddle/fluid/memory/allocation/mmap_allocator.h" +#include "paddle/fluid/operators/activation_op.h" +#include "paddle/fluid/operators/common_infer_shape_functions.h" +#include "paddle/fluid/operators/py_func_op.h" +#include "paddle/fluid/platform/cpu_helper.h" +#include "paddle/fluid/platform/cpu_info.h" +#include "paddle/fluid/platform/device/device_wrapper.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/dynload/dynamic_loader.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/init.h" +#include "paddle/fluid/platform/monitor.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_python.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" +#include "paddle/fluid/platform/profiler/profiler.h" +#include "paddle/fluid/pybind/cuda_streams_py.h" +#include "paddle/fluid/pybind/distributed_py.h" +#include "paddle/fluid/pybind/eager.h" +#include "paddle/fluid/pybind/imperative.h" +#include "paddle/fluid/pybind/io.h" +#include "paddle/phi/core/compat/convert_utils.h" +#include "paddle/phi/core/lod_utils.h" +#include "paddle/utils/none.h" +#ifdef PADDLE_WITH_ASCEND +#include "paddle/fluid/pybind/ascend_wrapper_py.h" +#endif +#include "paddle/fluid/pybind/bind_cost_model.h" +#include 
"paddle/fluid/pybind/bind_fleet_executor.h" +#include "paddle/fluid/pybind/box_helper_py.h" +#include "paddle/fluid/pybind/communication.h" +#include "paddle/fluid/pybind/compatible.h" +#include "paddle/fluid/pybind/const_value.h" +#include "paddle/fluid/pybind/data_set_py.h" +#include "paddle/fluid/pybind/exception.h" +#include "paddle/fluid/pybind/fleet_wrapper_py.h" +#include "paddle/fluid/pybind/generator_py.h" +#include "paddle/fluid/pybind/global_value_getter_setter.h" +#include "paddle/fluid/pybind/gloo_context_py.h" +#include "paddle/fluid/pybind/gloo_wrapper_py.h" +#include "paddle/fluid/pybind/heter_wrapper_py.h" +#include "paddle/fluid/pybind/inference_api.h" +#include "paddle/fluid/pybind/ir.h" +#include "paddle/fluid/pybind/metrics_py.h" +#include "paddle/fluid/pybind/ps_gpu_wrapper_py.h" +#include "paddle/fluid/pybind/pybind_boost_headers.h" +#include "paddle/phi/backends/device_manager.h" + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/fluid/pybind/nccl_wrapper_py.h" +#endif +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/pybind/protobuf.h" +#include "paddle/fluid/pybind/pybind.h" // NOLINT +#include "paddle/fluid/pybind/reader_py.h" +#include "paddle/fluid/pybind/tensor_py.h" +#include "paddle/fluid/string/to_string.h" +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" +#endif +#ifndef PADDLE_WITH_HIP +#include "paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h" +#endif +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#endif + +#ifdef PADDLE_WITH_ASCEND_CL +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/device/npu/npu_info.h" +#endif + +#ifdef PADDLE_WITH_XPU +#include "paddle/fluid/platform/device/xpu/xpu_info.h" +#include "paddle/fluid/platform/device/xpu/xpu_op_list.h" +#endif + +#ifdef PADDLE_WITH_CUSTOM_DEVICE +#include "paddle/phi/capi/capi.h" +#endif + +#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" + +#ifdef PADDLE_WITH_IPU +#include "paddle/fluid/platform/device/ipu/ipu_backend.h" +#include "paddle/fluid/platform/device/ipu/ipu_info.h" +#endif + +#ifdef PADDLE_WITH_MLU +#include "paddle/fluid/platform/device/mlu/mlu_info.h" +#endif + +#ifdef PADDLE_WITH_CRYPTO +#include "paddle/fluid/pybind/crypto.h" +#endif + +#if defined PADDLE_WITH_PSCORE +#include "paddle/fluid/pybind/fleet_py.h" +#endif + +#ifdef PADDLE_WITH_CINN +#include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h" +#endif + +#include "paddle/fluid/eager/api/utils/global_utils.h" +#include "paddle/fluid/imperative/layout_autotune.h" +#include "paddle/fluid/pybind/eager_utils.h" +#include "paddle/fluid/pybind/tensor.h" +#include "paddle/phi/api/ext/op_meta_info.h" +#include "paddle/phi/kernels/autotune/cache.h" +#include "paddle/phi/kernels/autotune/switch_autotune.h" +#include "pybind11/stl.h" + +DECLARE_bool(use_mkldnn); + +// disable auto conversion to list in Python +PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray); +PYBIND11_MAKE_OPAQUE(paddle::framework::FetchUnmergedList); +PYBIND11_MAKE_OPAQUE(paddle::framework::FetchList); +PYBIND11_MAKE_OPAQUE(paddle::framework::FetchType); + +namespace paddle { +namespace pybind { + +PyTypeObject *g_framework_tensor_pytype = nullptr; + +template +static void TensorCopyFrom(framework::Tensor *dst, + const framework::Tensor &src, + const PlaceType &place, + int64_t batch_size) { + if (batch_size < 0) { + 
framework::TensorCopy(src, place, dst); + } else { + auto sliced = src.Slice(0, batch_size); + framework::TensorCopy(sliced, place, dst); + } +} + +void BindTensor(pybind11::module &m) { // NOLINT + using namespace paddle::framework; // NOLINT + py::class_ framework_tensor( + m, "Tensor", py::buffer_protocol()); + g_framework_tensor_pytype = + reinterpret_cast(framework_tensor.ptr()); + framework_tensor + .def("__array__", + [](framework::Tensor &self) { return TensorToPyArray(self); }) + .def("_ptr", + [](const framework::Tensor &self) { + return reinterpret_cast(self.data()); + }) + .def("_slice", &framework::Tensor::Slice) + .def("_numel", &framework::Tensor::numel) + .def("_is_initialized", + [](const framework::Tensor &self) { return self.IsInitialized(); }) + .def("_get_dims", + [](const framework::Tensor &self) { return vectorize(self.dims()); }) + .def("_set_dims", + [](framework::Tensor &self, const std::vector &dim) { + self.Resize(phi::make_ddim(dim)); + }) + .def("_set_layout", + [](framework::Tensor &self, const std::string &layout) { + self.set_layout(StringToDataLayout(layout)); + }) + .def("_alloc_float", + [](framework::Tensor &self, paddle::platform::CustomPlace &place) { + self.mutable_data(place); + }) + .def("_alloc_float", + [](framework::Tensor &self, paddle::platform::CUDAPlace &place) { + self.mutable_data(place); + }) + .def("_alloc_float", + [](framework::Tensor &self, paddle::platform::XPUPlace &place) { + self.mutable_data(place); + }) + .def("_alloc_float", + [](framework::Tensor &self, paddle::platform::CPUPlace &place) { + self.mutable_data(place); + }) + .def("_alloc_float", + [](framework::Tensor &self, paddle::platform::NPUPlace &place) { + self.mutable_data(place); + }) + .def("_alloc_float", + [](framework::Tensor &self, paddle::platform::MLUPlace &place) { + self.mutable_data(place); + }) + .def("_alloc_double", + [](framework::Tensor &self, paddle::platform::CPUPlace &place) { + self.mutable_data(place); + }) + .def("_alloc_int", + [](framework::Tensor &self, paddle::platform::CPUPlace &place) { + self.mutable_data(place); + }) + .def("_alloc_int", + [](framework::Tensor &self, paddle::platform::CustomPlace &place) { + self.mutable_data(place); + }) + .def("_alloc_int", + [](framework::Tensor &self, paddle::platform::XPUPlace &place) { + self.mutable_data(place); + }) + .def("_alloc_int", + [](framework::Tensor &self, paddle::platform::CUDAPlace &place) { + self.mutable_data(place); + }) + .def("_alloc_int", + [](framework::Tensor &self, paddle::platform::MLUPlace &place) { + self.mutable_data(place); + }) + .def("_alloc_int", + [](framework::Tensor &self, + paddle::platform::CUDAPinnedPlace &place) { + self.mutable_data(place); + }) + .def("_alloc_float", + [](framework::Tensor &self, + paddle::platform::CUDAPinnedPlace &place) { + self.mutable_data(place); + }) + .def("_mutable_data", + [](framework::Tensor &self, + paddle::platform::CPUPlace &place, + paddle::framework::proto::VarType::Type type) { + return reinterpret_cast( + self.mutable_data(place, framework::TransToPhiDataType(type))); + }) + .def("_mutable_data", + [](framework::Tensor &self, + paddle::platform::CustomPlace &place, + paddle::framework::proto::VarType::Type type) { + return reinterpret_cast( + self.mutable_data(place, framework::TransToPhiDataType(type))); + }) + .def("_mutable_data", + [](framework::Tensor &self, + paddle::platform::XPUPlace &place, + paddle::framework::proto::VarType::Type type) { + return reinterpret_cast( + self.mutable_data(place, 
framework::TransToPhiDataType(type))); + }) + .def("_mutable_data", + [](framework::Tensor &self, + paddle::platform::CUDAPlace &place, + paddle::framework::proto::VarType::Type type) { + return reinterpret_cast( + self.mutable_data(place, framework::TransToPhiDataType(type))); + }) + .def("_mutable_data", + [](framework::Tensor &self, + paddle::platform::CUDAPinnedPlace &place, + paddle::framework::proto::VarType::Type type) { + return reinterpret_cast( + self.mutable_data(place, framework::TransToPhiDataType(type))); + }) + .def("_mutable_data", + [](framework::Tensor &self, + paddle::platform::MLUPlace &place, + paddle::framework::proto::VarType::Type type) { + return reinterpret_cast( + self.mutable_data(place, framework::TransToPhiDataType(type))); + }) + .def("_clear", &framework::Tensor::clear) + .def("_mutable_data", + [](framework::Tensor &self, + paddle::platform::NPUPlace &place, + paddle::framework::proto::VarType::Type type) { + return reinterpret_cast( + self.mutable_data(place, framework::TransToPhiDataType(type))); + }) + .def("_copy_from", + &TensorCopyFrom, + py::arg("tensor"), + py::arg("place"), + py::arg("batch_size") = -1) + .def("_copy_from", + &TensorCopyFrom, + py::arg("tensor"), + py::arg("place"), + py::arg("batch_size") = -1) + .def("_copy_from", + &TensorCopyFrom, + py::arg("tensor"), + py::arg("place"), + py::arg("batch_size") = -1) + .def("_copy_from", + &TensorCopyFrom, + py::arg("tensor"), + py::arg("place"), + py::arg("batch_size") = -1) + .def("_copy_from", + &TensorCopyFrom, + py::arg("tensor"), + py::arg("place"), + py::arg("batch_size") = -1) + .def("_copy_from", + &TensorCopyFrom, + py::arg("tensor"), + py::arg("place"), + py::arg("batch_size") = -1) + .def("_copy_from", + &TensorCopyFrom, + py::arg("tensor"), + py::arg("place"), + py::arg("batch_size") = -1) + .def("_copy_from", + &TensorCopyFrom, + py::arg("tensor"), + py::arg("place"), + py::arg("batch_size") = -1) + .def("set", + SetTensorFromPyArray, + py::arg("array"), + py::arg("place"), + py::arg("zero_copy") = false) + .def("set", + SetTensorFromPyArray, + py::arg("array"), + py::arg("place"), + py::arg("zero_copy") = false) + .def("set", + SetTensorFromPyArray, + py::arg("array"), + py::arg("place"), + py::arg("zero_copy") = false) + .def("set", + SetTensorFromPyArray, + py::arg("array"), + py::arg("place"), + py::arg("zero_copy") = false) + .def("set", + SetTensorFromPyArray, + py::arg("array"), + py::arg("place"), + py::arg("zero_copy") = false) + .def("set", + SetTensorFromPyArray, + py::arg("array"), + py::arg("place"), + py::arg("zero_copy") = false) + .def("set", + SetTensorFromPyArray, + py::arg("array"), + py::arg("place"), + py::arg("zero_copy") = false) + .def("set", + SetTensorFromPyArray, + py::arg("array"), + py::arg("place"), + py::arg("zero_copy") = false, + R"DOC( + Set the data of Tensor on place with given numpy array. + + Args: + lod (numpy.ndarray): The data to set. + place (CPUPlace|CUDAPlace|XPUPlace|IPUPlace|CUDAPinnedPlace|NPUPlace|MLUPlace): The place where the + Tensor is to be set. + zero_copy (bool, optional): Whether to share memory with the input numpy array. + This parameter only works with CPUPlace. Default: False. + + Returns: + None. + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + import numpy as np + + t = fluid.Tensor() + t.set(np.ndarray([5, 30]), fluid.CPUPlace()) + )DOC") + + .def( + "shape", + [](framework::Tensor &self) { return vectorize(self.dims()); }, + R"DOC( + Return the shape of Tensor. 
+ + Returns: + list[int]: The shape of Tensor. + + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + import numpy as np + + t = fluid.Tensor() + t.set(np.ndarray([5, 30]), fluid.CPUPlace()) + print(t.shape()) # [5, 30] + )DOC") + .def("_to_dlpack", + [](framework::Tensor &self) { + DLPackTensor dlpack_tensor(self, 1); + DLManagedTensor *dmt = dlpack_tensor.ToDLManagedTensor(); + auto capsule = py::capsule( + static_cast(dmt), "dltensor", [](PyObject *ptr) { + if (ptr) { + auto dltensor = new DLManagedTensor; + try { + dltensor = reinterpret_cast( + PyCapsule_GetPointer(ptr, "used_dltensor")); + return; + } catch (...) { + dltensor = reinterpret_cast( + PyCapsule_GetPointer(ptr, "dltensor")); + } + dltensor->deleter(dltensor); + } + }); + return capsule; + }) + .def("_set_float_element", TensorSetElement) + .def("_get_float_element", TensorGetElement) + .def("_set_double_element", TensorSetElement) + .def("_get_double_element", TensorGetElement) + .def("_place", [](framework::Tensor &self) { return self.place(); }) + .def("_dtype", + [](framework::Tensor &self) { + return framework::TransToProtoVarType(self.type()); + }) + .def("_layout", + [](framework::Tensor &self) { + return DataLayoutToString(self.layout()); + }) + .def("_share_data_with", &framework::Tensor::ShareDataWith) + .def("__getitem__", PySliceTensor, py::return_value_policy::reference) + .def("__str__", + [](const framework::Tensor &self) { + std::stringstream ostr; + ostr << self; + return ostr.str(); + }) /* ------ End of original Tensor ------ */ + .def("__init__", + [](framework::Tensor &instance, + const std::vector> + &recursive_sequence_lengths) { + LoD new_lod; + new_lod.reserve(recursive_sequence_lengths.size()); + std::copy(recursive_sequence_lengths.begin(), + recursive_sequence_lengths.end(), + std::back_inserter(new_lod)); + LoD new_offset_lod = ConvertToOffsetBasedLoD(new_lod); + PADDLE_ENFORCE_EQ( + CheckLoD(new_offset_lod, -1), + true, + platform::errors::InvalidArgument( + "The provided recursive_sequence_lengths info is " + "invalid, " + "the LoD converted by recursive_sequence_lengths is %s", + new_lod)); + new (&instance) framework::Tensor(new_offset_lod); + }) + .def("__init__", + [](framework::Tensor &instance) { + new (&instance) framework::Tensor(); + }) + // We implement offset based LOD in C++ while we use length based with + // Python API. So we changed set_lod to set_recursive_sequence_lengths + // to + // avoid misuse. + // The discussion is here: + // https://github.com/PaddlePaddle/Paddle/issues/10855 + .def( + "set_lod", + [](framework::Tensor &self, + const std::vector> &lod) { + // the input lod is offset-based level-of-detail info + LoD new_lod; + new_lod.reserve(lod.size()); + std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod)); + PADDLE_ENFORCE_EQ( + CheckLoD(new_lod, vectorize(self.dims()).front()), + true, + platform::errors::InvalidArgument( + "The provided LoD is invalid, the LoD is %s", new_lod)); + self.set_lod(new_lod); + }, + py::arg("lod"), + R"DOC( + Set LoD of the Tensor. + + Args: + lod (list[list[int]]): The lod to set. + + Returns: + None. + + Examples: + .. 
code-block:: python + + import paddle.fluid as fluid + import numpy as np + + t = fluid.Tensor() + t.set(np.ndarray([5, 30]), fluid.CPUPlace()) + t.set_lod([[0, 2, 5]]) + print(t.lod()) # [[0, 2, 5]] + )DOC") + .def( + "set_recursive_sequence_lengths", + [](framework::Tensor &self, + const std::vector> + &recursive_sequence_lengths) { + // the input recursive_sequence_lengths is length-based + // level-of-detail info + LoD new_lod; + new_lod.reserve(recursive_sequence_lengths.size()); + std::copy(recursive_sequence_lengths.begin(), + recursive_sequence_lengths.end(), + std::back_inserter(new_lod)); + LoD new_offset_lod = ConvertToOffsetBasedLoD(new_lod); + PADDLE_ENFORCE_EQ( + CheckLoD(new_offset_lod, vectorize(self.dims()).front()), + true, + platform::errors::InvalidArgument( + "The provided recursive_sequence_lengths info is " + "invalid, " + "the LoD converted by recursive_sequence_lengths is " + "%s", + new_lod)); + self.set_lod(new_offset_lod); + }, + py::arg("recursive_sequence_lengths"), + R"DOC( + Set LoD of the Tensor according to recursive sequence lengths. + + For example, if recursive_sequence_lengths=[[2, 3]], which means + there are two sequences with length 2 and 3 respectively, the + corresponding lod would be [[0, 2, 2+3]], i.e., [[0, 2, 5]]. + + Args: + recursive_sequence_lengths (list[list[int]]): The recursive sequence lengths. + + Returns: + None. + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + import numpy as np + + t = fluid.Tensor() + t.set(np.ndarray([5, 30]), fluid.CPUPlace()) + t.set_recursive_sequence_lengths([[2, 3]]) + print(t.recursive_sequence_lengths()) # [[2, 3]] + print(t.lod()) # [[0, 2, 5]] + )DOC") + .def( + "lod", + [](framework::Tensor &self) -> std::vector> { + // output the offset-based lod info + LoD lod = self.lod(); + std::vector> new_lod; + new_lod.reserve(lod.size()); + std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod)); + return new_lod; + }, + R"DOC( + Return the LoD of the Tensor. + + Returns: + list[list[int]]: The lod of the Tensor. + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + import numpy as np + + t = fluid.Tensor() + t.set(np.ndarray([5, 30]), fluid.CPUPlace()) + t.set_lod([[0, 2, 5]]) + print(t.lod()) # [[0, 2, 5]] + )DOC") + // Set above comments of set_lod. + .def( + "recursive_sequence_lengths", + [](framework::Tensor &self) -> std::vector> { + // output the length-based lod info + LoD lod = phi::ConvertToLengthBasedLoD(self.lod()); + std::vector> new_lod; + new_lod.reserve(lod.size()); + std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod)); + return new_lod; + }, + R"DOC( + Return the recursive sequence lengths corresponding to of the LodD + of the Tensor. + + Returns: + list[list[int]]: The recursive sequence lengths. + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + import numpy as np + + t = fluid.Tensor() + t.set(np.ndarray([5, 30]), fluid.CPUPlace()) + t.set_recursive_sequence_lengths([[2, 3]]) + print(t.recursive_sequence_lengths()) # [[2, 3]] + )DOC") + .def( + "has_valid_recursive_sequence_lengths", + [](framework::Tensor &self) -> bool { + // Check that the lod info is valid and match the outermost + // dimension of the Tensor data + return CheckLoD(self.lod(), vectorize(self.dims()).front()); + }, + R"DOC( + Check whether the LoD of the Tensor is valid. + + Returns: + bool: Whether the LoD is valid. + + Examples: + .. 
code-block:: python + + import paddle.fluid as fluid + import numpy as np + + t = fluid.Tensor() + t.set(np.ndarray([5, 30]), fluid.CPUPlace()) + t.set_recursive_sequence_lengths([[2, 3]]) + print(t.has_valid_recursive_sequence_lengths()) # True + )DOC") + .def("_as_type", + [](const framework::Tensor &self, + paddle::framework::proto::VarType::Type type) { + framework::Tensor dst; + if (self.IsInitialized() && self.numel() > 0) { + TransDataType(self, type, &dst); + } + return dst; + }) + .def("_copy", + [](const framework::Tensor &self, const platform::Place &place) { + // follow fetch_op's inplementation + framework::Tensor dst; + if (self.IsInitialized() && self.numel() > 0) { + TensorCopySync(self, place, &dst); + } else { + // Not copy, if the src tensor is empty. + dst.clear(); + dst.Resize({0}); + } + dst.set_lod(self.lod()); + return dst; +#ifdef _WIN32 + }); +#else + }) +#ifdef PADDLE_WITH_CUDA + .def("_share_buffer_with", + [](framework::Tensor &self, const framework::Tensor src, + py::tuple t) { + auto *cuda_ipc_allocation = + dynamic_cast( + src.Holder().get()); + + PADDLE_ENFORCE_NOT_NULL( + cuda_ipc_allocation, + platform::errors::PreconditionNotMet( + "Tensor is not Cuda IPC shared tensor. " + "Now only Tensor shared by cuda ipc could use this " + "api.")); + + size_t size = t[0].cast(); + auto dtype = + static_cast(t[1].cast()); + auto dims = phi::make_ddim(t[2].cast>()); + auto lod_info = t[3].cast(); + auto device_id = t[4].cast(); + + auto shared_reader_holder = + std::make_shared( + cuda_ipc_allocation->ptr(), + cuda_ipc_allocation->base_ptr(), size, + platform::CUDAPlace(device_id)); + + self.ResetHolderWithType(shared_reader_holder, dtype); + self.Resize(dims); + self.set_lod(lod_info); + + VLOG(6) << "Reconstructed tensor with buffer shared!"; + }, + R"DOC( + Deserialize GPU Tensor for existed shared Cuda IPC tensor. + + Params: + tensor: Shared Cuda IPC tensor. + tuple: contrains data size, data type, + tensor dims, lod information, device index. + + )DOC") + .def("_share_cuda", + [](framework::Tensor self) { + if (!self.IsInitialized() || self.numel() == 0) + throw std::runtime_error( + "Tensor not initialized or numel is 0. could not pass " + "to shared memory. "); + + auto *holder = dynamic_cast( + self.Holder().get()); + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(holder->place()), true, + platform::errors::InvalidArgument( + "Tensor is not on GPU. share_cuda only support GPU " + "Tensor, share_filename is for CPU tensor.")); + + void *base_ptr = holder->base_ptr(); + ptrdiff_t offset_bytes = reinterpret_cast(holder->ptr()) - + reinterpret_cast(base_ptr); + + cudaIpcMemHandle_t handle; + PADDLE_ENFORCE_GPU_SUCCESS(cudaIpcGetMemHandle(&handle, base_ptr)); + + auto _handle = py::bytes(reinterpret_cast(&handle), + (py::ssize_t)CUDA_IPC_HANDLE_SIZE); + + // TODO(ZHUI): use cuda event, to avoid sync. + const auto &device_id = paddle::platform::GetCurrentDeviceId(); + auto stream = + paddle::platform::stream::get_current_stream(device_id); + stream->Synchronize(); + + int type_idx = static_cast(self.type()); + size_t data_size = + self.numel() * + framework::SizeOfType( + framework::TransToProtoVarType(self.type())); + + return py::make_tuple(_handle, (py::size_t)offset_bytes, data_size, + type_idx, vectorize(self.dims()), self.lod(), + device_id); + }, + R"DOC( + Serialize GPU Tensor by cudaIpcMemHandle. + + Returns: + tuple: contrains handle, data size, data type, + tensor dims, lod information, device index. + + Examples: + .. 
code-block:: python + + import paddle + tensor = paddle.ones([3,3]) + metainfo = tensor.value().get_tensor()._share_cuda() + + )DOC") + .def("_new_shared_cuda", + [](py::tuple t) { + if (t.size() != 7) + throw std::runtime_error( + "Invalid Tensor meta info for shared cuda tensor!"); + + // 1. Create a new C++ instance + framework::Tensor tensor; + + // 2. Rebuild Allocation from handle + const std::string &handle = t[0].cast(); + ptrdiff_t offset_bytes = (ptrdiff_t)t[1].cast(); + auto device_id = t[6].cast(); + auto base_ptr = memory::allocation::GetIpcBasePtr(handle); + size_t size = t[2].cast(); + void *dev = base_ptr.get(); + dev = reinterpret_cast(dev) + offset_bytes; + + auto shared_reader_holder = + std::make_shared( + dev, size, device_id, std::move(base_ptr)); + + // 3. Rebuild Tensor + tensor.ResetHolderWithType( + shared_reader_holder, + static_cast(t[3].cast())); + tensor.Resize(phi::make_ddim(t[4].cast>())); + tensor.set_lod(t[5].cast()); + + return tensor; + }, + R"DOC( + Deserialize GPU lod tensor from cudaIpcMemHandle. + + Params: + tuple: contrains handle, data size, data type, + tensor dims, lod information, device index. + + Examples: + .. code-block:: python + + import paddle + tensor = paddle.ones([3,3]) + metainfo = tensor.value().get_tensor()._share_cuda() + tensor_from_shared = paddle.to_tensor(paddle.fluid.core.LoDTensor._new_shared_cuda(metainfo)) + + )DOC") +#endif + .def("_share_filename", + [](framework::Tensor &self) { + if (!self.IsInitialized() || self.numel() == 0) + throw std::runtime_error( + "Tensor not initialized or numel is 0. could not pass to " + "shared memory. "); + + auto holder = self.Holder(); + PADDLE_ENFORCE_EQ( + platform::is_cpu_place(holder->place()) || + platform::is_cuda_pinned_place(holder->place()), + true, platform::errors::InvalidArgument( + "Tensor is not on CPU. share_filename only " + "support CPU Tensor.")); + + auto *mmap_allocation = dynamic_cast< + memory::allocation::RefcountedMemoryMapAllocation *>( + holder.get()); + // If the tensor is not shared, allocate memory map allocation. + if (mmap_allocation == nullptr) { + void *data_ptr = self.data(); + size_t data_size = + self.numel() * + framework::SizeOfType( + framework::TransToProtoVarType(self.type())); + + int flags = memory::allocation::MAPPED_SHAREDMEM | + memory::allocation::MAPPED_EXCLUSIVE; + std::string handle = memory::allocation::GetIPCName(); + auto shared_holder = + memory::allocation::AllocateRefcountedMemoryMapAllocation( + handle, flags, data_size); + + // copy data & reset holder + if (platform::is_cuda_pinned_place(holder->place())) { +#ifdef PADDLE_WITH_CUDA + memory::Copy(platform::CPUPlace(), shared_holder->ptr(), + platform::CUDAPinnedPlace(), data_ptr, data_size); +#endif + } else { + memory::Copy(platform::CPUPlace(), shared_holder->ptr(), + platform::CPUPlace(), data_ptr, data_size); + } + self.ResetHolder(shared_holder); + mmap_allocation = shared_holder.get(); + } + int type_idx = static_cast(self.type()); + + return py::make_tuple(mmap_allocation->ipc_name(), + mmap_allocation->size(), type_idx, + vectorize(self.dims()), self.lod()); + }, + R"DOC( + Serialize CPU lod tensor in shared memory to tuple. + If the tensor is not in shared memory, we will copy it first. + + Returns: + tuple: contrains ipc name, data size, data type, + tensor dims and lod imformation. + + Examples: + .. 
code-block:: python + + import paddle + tensor = paddle.ones([3,3]) + metainfo = tensor.value().get_tensor()._share_filename() + + )DOC") + .def("_new_shared_filename", + [](py::tuple t) { // __setstate__ + if (t.size() != 5) + throw std::runtime_error("Invalid Tensor meta info state!"); + + framework::Tensor tensor; + + // 2. Rebuild Allocation + const std::string &ipc_name = t[0].cast(); + size_t size = t[1].cast(); + int flags = memory::allocation::MAPPED_SHAREDMEM | + memory::allocation::MAPPED_NOCREATE; + + auto shared_holder = + memory::allocation::AllocateRefcountedMemoryMapAllocation( + ipc_name, flags, size); + + // 3. Rebuild Tensor + tensor.ResetHolderWithType( + shared_holder, + static_cast(t[2].cast())); + tensor.Resize(phi::make_ddim(t[3].cast>())); + tensor.set_lod(t[4].cast()); + + return tensor; + }, + R"DOC( + Deserialize CPU lod tensor from shared memory. + + Params: + tuple: contrains ipc file name, data size, data type, + tensor dims and lod information. + + Examples: + .. code-block:: python + + import paddle + tensor = paddle.ones([3,3]) + metainfo = tensor.value().get_tensor()._share_filename() + tensor_from_shared = paddle.to_tensor(paddle.fluid.core.LoDTensor._new_shared_filename(metainfo)) + + )DOC") + .def("_shared_incref", + [](framework::Tensor &self) { + auto *mmap_allocation = dynamic_cast< + memory::allocation::RefcountedMemoryMapAllocation *>( + self.Holder().get()); + if (mmap_allocation) { + mmap_allocation->incref(); + } + }, + R"DOC( + Increase reference count of share_filename tensor. + )DOC") + .def("_shared_decref", + [](framework::Tensor &self) { + auto *mmap_allocation = dynamic_cast< + memory::allocation::RefcountedMemoryMapAllocation *>( + self.Holder().get()); + if (mmap_allocation) { + mmap_allocation->decref(); + } + }, + R"DOC( + Decrease reference count of share_filename tensor. + )DOC") + .def(py::pickle( + [](const framework::Tensor &t) { // __getstate__ + auto holder = t.Holder(); + PADDLE_ENFORCE_EQ(platform::is_cpu_place(holder->place()), true, + platform::errors::PreconditionNotMet( + "Tensor is not on CPU." + "Now only Tensor on CPU can be serialized.")); + auto *mmap_writer_allocation = + dynamic_cast( + holder.get()); + PADDLE_ENFORCE_NOT_NULL( + mmap_writer_allocation, + platform::errors::PreconditionNotMet( + "Tensor is not in shared memory." + "Now only Tensor on shared memory can be serialized.")); + int type_idx = static_cast(t.type()); + + return py::make_tuple(mmap_writer_allocation->ipc_name(), + mmap_writer_allocation->size(), type_idx, + vectorize(t.dims()), t.lod()); + }, + [](py::tuple t) { // __setstate__ + if (t.size() != 5) + throw std::runtime_error("Invalid Tensor state!"); + + // 1. Create a new C++ instance + framework::Tensor tensor; + + // 2. Rebuild Allocation + const std::string &ipc_name = t[0].cast(); + size_t size = t[1].cast(); + auto shared_reader_holder = + memory::allocation::RebuildMemoryMapReaderAllocation(ipc_name, + size); + + // 3. Maintain global fd set + VLOG(3) << "Tensor ipc name: " << ipc_name; + memory::allocation::MemoryMapFdSet::Instance().Insert(ipc_name); + + // 4. 
Rebuild Tensor
+            tensor.ResetHolderWithType(
+                shared_reader_holder,
+                static_cast(t[2].cast()));
+            tensor.Resize(phi::make_ddim(t[3].cast>()));
+            tensor.set_lod(t[4].cast());
+
+            return tensor;
+          }));
+#endif
+
+  py::class_<phi::SelectedRows>(m, "SelectedRows")
+      .def("__init__",
+           [](phi::SelectedRows &instance) {
+             new (&instance) phi::SelectedRows();
+           })
+      .def("__init__",
+           [](phi::SelectedRows &instance,
+              const std::vector<int64_t> rows,
+              const int64_t &height) {
+             new (&instance) phi::SelectedRows(rows, height);
+           })
+      .def(
+          "get_tensor",
+          [](phi::SelectedRows &self) { return self.mutable_value(); },
+          py::return_value_policy::reference)
+      .def("numel",
+           [](phi::SelectedRows &self) -> int64_t {
+             return self.value().numel();
+           })
+      .def("set_height", &phi::SelectedRows::set_height)
+      .def("height", &phi::SelectedRows::height)
+      .def("set_rows",
+           [](phi::SelectedRows &self, std::vector<int64_t> rows) {
+#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
+             self.set_rows(rows);
+#else
+             Vector<int64_t> new_rows(rows);
+             self.set_rows(new_rows);
+#endif
+           })
+      .def("sync_index",
+           [](phi::SelectedRows &instance) { instance.SyncIndex(); })
+      .def("rows", [](phi::SelectedRows &self) {
+        auto rows = self.rows();
+        std::vector<int64_t> new_rows;
+        new_rows.reserve(rows.size());
+        std::copy(rows.begin(), rows.end(), std::back_inserter(new_rows));
+        return new_rows;
+      });
+}
+
+} // namespace pybind
+} // namespace paddle
diff --git a/paddle/fluid/pybind/tensor.h b/paddle/fluid/pybind/tensor.h
new file mode 100644
index 0000000000..a21236724b
--- /dev/null
+++ b/paddle/fluid/pybind/tensor.h
@@ -0,0 +1,25 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "pybind11/pybind11.h"
+
+namespace paddle {
+namespace pybind {
+
+void BindTensor(pybind11::module& m); // NOLINT
+
+} // namespace pybind
+} // namespace paddle
--
GitLab
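
A minimal usage sketch of the batch_size argument exposed through the TensorCopyFrom helper in tensor.cc: a negative batch_size copies the whole tensor, a non-negative one copies only the first batch_size rows. It assumes the legacy fluid.Tensor wrapper used in the docstrings above; variable names are illustrative only.

    import numpy as np
    import paddle.fluid as fluid

    place = fluid.CPUPlace()

    src = fluid.Tensor()
    src.set(np.random.rand(5, 30).astype('float32'), place)

    full = fluid.Tensor()
    full._copy_from(src, place)       # batch_size defaults to -1: copy all rows
    head = fluid.Tensor()
    head._copy_from(src, place, 2)    # copy only the first 2 rows

    print(full.shape())   # [5, 30]
    print(head.shape())   # [2, 30]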
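
A short, hedged sketch of the SelectedRows bindings registered at the end of BindTensor (rows/height constructor, get_tensor, rows, height, numel); the concrete row indices and shapes are illustrative.

    import numpy as np
    import paddle.fluid as fluid
    from paddle.fluid import core

    # SelectedRows holding 3 of 10 logical rows, backed by a dense value tensor.
    sr = core.SelectedRows([0, 2, 4], 10)
    value = sr.get_tensor()
    value.set(np.random.rand(3, 8).astype('float32'), fluid.CPUPlace())

    print(sr.rows())     # [0, 2, 4]
    print(sr.height())   # 10
    print(sr.numel())    # 24, numel of the dense value tensor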
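
As for how the split is consumed: pybind.cc forwards the same pybind11 module into BindParallelExecutor, BindPlace and BindTensor, so the Python-visible classes are expected to remain under paddle.fluid.core as before. A hedged sanity check, assuming a standard build:

    from paddle.fluid import core

    t = core.Tensor()     # registered in tensor.cc via BindTensor
    p = core.CPUPlace()   # registered in place.cc via BindPlace
    print(type(t), type(p))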