未验证 提交 4baf0dbe 编写于 作者: W wanghuancoder 提交者: GitHub

Compilation optimization (#44242)

* Compilation optimization
上级 e9b4d0be
......@@ -6,7 +6,7 @@ cc_library(
if(NOT (NOT WITH_PYTHON AND ON_INFER))
cc_library(
final_dygraph_node
SRCS nodes.cc
DEPS ${eager_deps} ${eager_manual_nodes})
SRCS nodes.cc ${eager_manual_nodes}
DEPS ${eager_deps})
add_dependencies(final_dygraph_node eager_final_state_codegen)
endif()
......@@ -6,7 +6,7 @@ cc_library(
if(NOT (NOT WITH_PYTHON AND ON_INFER))
cc_library(
final_dygraph_function
SRCS dygraph_functions.cc
DEPS ${eager_deps} ${eager_manual_functions})
SRCS dygraph_functions.cc ${eager_manual_functions}
DEPS ${eager_deps})
add_dependencies(final_dygraph_function eager_final_state_codegen)
endif()
cc_library(
add_n_fwd_func
SRCS add_n_fwd_func.cc
DEPS ${eager_deps} ${fluid_deps} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})
add_dependencies(add_n_fwd_func eager_codegen)
cc_library(
conv2d_fwd_function
SRCS conv2d_fwd_function.cc
DEPS ${eager_deps} ${fluid_deps} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})
add_dependencies(conv2d_fwd_function eager_codegen)
set(eager_manual_functions
conv2d_fwd_function add_n_fwd_func
${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/eager_manual/forwards/add_n_fwd_func.cc
${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc
PARENT_SCOPE)
cc_library(
add_n_node
SRCS add_n_node.cc
DEPS ${eager_deps} ${fluid_deps})
cc_library(
conv2d_nodes
SRCS conv2d_nodes.cc
DEPS ${eager_deps} ${fluid_deps})
set(eager_manual_nodes
conv2d_nodes add_n_node
${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/eager_manual/nodes/conv2d_nodes.cc
${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/eager_manual/nodes/add_n_node.cc
PARENT_SCOPE)
cc_library(
fused_gate_attention_fwd_func
SRCS fused_gate_attention_fwd_func.cc
DEPS ${eager_deps} ${fluid_deps} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})
add_dependencies(fused_gate_attention_fwd_func eager_codegen
copy_dygraph_forward_functions)
cc_library(
fused_feedforward_fwd_func
SRCS fused_feedforward_fwd_func.cc
DEPS ${eager_deps} ${fluid_deps} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})
add_dependencies(fused_feedforward_fwd_func eager_codegen
copy_dygraph_forward_functions)
cc_library(
fused_attention_fwd_func
SRCS fused_attention_fwd_func.cc
DEPS ${eager_deps} ${fluid_deps} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})
add_dependencies(fused_attention_fwd_func eager_codegen
copy_dygraph_forward_functions)
set(fluid_manual_functions
fused_gate_attention_fwd_func fused_feedforward_fwd_func
fused_attention_fwd_func
${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_gate_attention_fwd_func.cc
${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_feedforward_fwd_func.cc
${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_attention_fwd_func.cc
PARENT_SCOPE)
cc_library(
fused_gate_attention_node
SRCS fused_gate_attention_node.cc
DEPS ${eager_deps} ${fluid_deps})
cc_library(
fused_feedforward_node
SRCS fused_feedforward_node.cc
DEPS ${eager_deps} ${fluid_deps})
cc_library(
fused_attention_node
SRCS fused_attention_node.cc
DEPS ${eager_deps} ${fluid_deps})
set(fluid_manual_nodes
fused_gate_attention_node fused_feedforward_node fused_attention_node
${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/fluid_manual/nodes/fused_gate_attention_node.cc
${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/fluid_manual/nodes/fused_feedforward_node.cc
${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/fluid_manual/nodes/fused_attention_node.cc
PARENT_SCOPE)
......@@ -3083,27 +3083,44 @@ static std::string ConvertCoreOpsInfosToString(
return core_ops_returns_info_init_str;
}
static std::string GenerateCoreOpsReturnsInfo() {
static std::string GenerateCoreOpsArgsInfo() {
const char* Core_Ops_Returns_MAP_TEMPLATE =
"std::unordered_map<std::string, std::vector<std::string>> "
"core_ops_args_info = { %s };\n"
"std::unordered_map<std::string, std::vector<std::string>> "
"core_ops_args_type_info = { %s };\n"
"std::unordered_map<std::string, std::vector<std::string>> "
"core_ops_returns_info = { %s };\n";
"core_ops_args_info = { %s };\n";
std::string core_ops_args_info_init_str =
ConvertCoreOpsInfosToString(core_ops_args_info);
std::string core_ops_info_str = paddle::string::Sprintf(
Core_Ops_Returns_MAP_TEMPLATE, core_ops_args_info_init_str);
return core_ops_info_str;
}
static std::string GenerateCoreOpsArgsTypeInfo() {
const char* Core_Ops_Returns_MAP_TEMPLATE =
"std::unordered_map<std::string, std::vector<std::string>> "
"core_ops_args_type_info = { %s };\n";
std::string core_ops_args_type_info_init_str =
ConvertCoreOpsInfosToString(core_ops_args_type_info);
std::string core_ops_info_str = paddle::string::Sprintf(
Core_Ops_Returns_MAP_TEMPLATE, core_ops_args_type_info_init_str);
return core_ops_info_str;
}
static std::string GenerateCoreOpsReturnsInfo() {
const char* Core_Ops_Returns_MAP_TEMPLATE =
"std::unordered_map<std::string, std::vector<std::string>> "
"core_ops_returns_info = { %s };\n";
std::string core_ops_returns_info_init_str =
ConvertCoreOpsInfosToString(core_ops_returns_info);
std::string core_ops_info_str =
paddle::string::Sprintf(Core_Ops_Returns_MAP_TEMPLATE,
core_ops_args_info_init_str,
core_ops_args_type_info_init_str,
core_ops_returns_info_init_str);
std::string core_ops_info_str = paddle::string::Sprintf(
Core_Ops_Returns_MAP_TEMPLATE, core_ops_returns_info_init_str);
return core_ops_info_str;
}
......@@ -3252,6 +3269,12 @@ static void DygraphCodeGeneration(const std::string& output_dir,
GenerateForwardDygraphFile(
output_dir + "/forwards/dygraph_forward_functions_args_info.tmp.cc",
GenerateCoreOpsArgsInfo());
GenerateForwardDygraphFile(
output_dir + "/forwards/dygraph_forward_functions_args_type_info.tmp.cc",
GenerateCoreOpsArgsTypeInfo());
GenerateForwardDygraphFile(
output_dir + "/forwards/dygraph_forward_functions_returns_info.tmp.cc",
GenerateCoreOpsReturnsInfo());
VLOG(6) << "-------- GenerateNodeCCFile -------";
......
......@@ -96,6 +96,11 @@ def GenerateFileStructureForIntermediateDygraph(eager_dir, split_count):
"nodes" + str(i + 1) + ".cc"))
empty_files.append(
os.path.join(forwards_dir, "dygraph_forward_functions_args_info.cc"))
empty_files.append(
os.path.join(forwards_dir,
"dygraph_forward_functions_args_type_info.cc"))
empty_files.append(
os.path.join(forwards_dir, "dygraph_forward_functions_returns_info.cc"))
for path in empty_files:
if not os.path.exists(path):
open(path, 'a').close()
......@@ -125,7 +130,7 @@ def GenerateFileStructureForIntermediateDygraph(eager_dir, split_count):
f.write("cc_library(dygraph_node SRCS ")
for i in range(split_count):
f.write("nodes" + str(i + 1) + ".cc ")
f.write("DEPS ${eager_deps} ${fluid_deps} ${fluid_manual_nodes})\n")
f.write("${fluid_manual_nodes} DEPS ${eager_deps} ${fluid_deps})\n")
f.write("add_dependencies(dygraph_node copy_dygraph_node)")
with open(forwards_level_cmakelist_path, "w") as f:
......@@ -143,6 +148,12 @@ def GenerateFileStructureForIntermediateDygraph(eager_dir, split_count):
f.write(
" COMMAND ${CMAKE_COMMAND} -E copy_if_different \"${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/forwards/dygraph_forward_functions_args_info.tmp.cc\" \"${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/forwards/dygraph_forward_functions_args_info.cc\"\n"
)
f.write(
" COMMAND ${CMAKE_COMMAND} -E copy_if_different \"${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/forwards/dygraph_forward_functions_args_type_info.tmp.cc\" \"${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/forwards/dygraph_forward_functions_args_type_info.cc\"\n"
)
f.write(
" COMMAND ${CMAKE_COMMAND} -E copy_if_different \"${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/forwards/dygraph_forward_functions_returns_info.tmp.cc\" \"${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/forwards/dygraph_forward_functions_returns_info.cc\"\n"
)
f.write(" DEPENDS eager_codegen\n")
f.write(" VERBATIM)\n")
......@@ -150,8 +161,10 @@ def GenerateFileStructureForIntermediateDygraph(eager_dir, split_count):
for i in range(split_count):
f.write("dygraph_forward_functions" + str(i + 1) + ".cc ")
f.write("dygraph_forward_functions_args_info.cc ")
f.write("dygraph_forward_functions_args_type_info.cc ")
f.write("dygraph_forward_functions_returns_info.cc ")
f.write(
"DEPS ${eager_deps} ${fluid_deps} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} ${fluid_manual_functions})\n"
"${fluid_manual_functions} DEPS ${eager_deps} ${fluid_deps} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})\n"
)
f.write(
"add_dependencies(dygraph_function copy_dygraph_forward_functions)")
......
set(INTERPRETERCORE_DEPS
add_subdirectory(workqueue)
add_subdirectory(garbage_collector)
set(STANDALONE_EXECUTOR_SRCS
data_transfer.cc
new_executor_defs.cc
interpretercore_util.cc
event_manager.cc
stream_analyzer.cc
interpretercore.cc
standalone_executor.cc)
set(STANDALONE_EXECUTOR_DEPS
op_registry
device_context
scope
......@@ -20,62 +32,33 @@ set(INTERPRETERCORE_DEPS
variable_helper
timer
monitor
nan_inf_utils)
add_subdirectory(workqueue)
add_subdirectory(garbage_collector)
cc_library(
data_transfer
SRCS data_transfer.cc
DEPS enforce scope glog)
cc_library(
new_executor_defs
SRCS new_executor_defs.cc
DEPS enforce glog scope)
cc_library(
interpretercore_util
SRCS interpretercore_util.cc
DEPS ${INTERPRETERCORE_DEPS} workqueue new_executor_defs data_transfer)
cc_library(
event_manager
SRCS event_manager.cc
DEPS ${DEVICE_EVENT_LIBS} glog new_executor_defs)
cc_library(
stream_analyzer
SRCS stream_analyzer.cc
DEPS ${DEVICE_EVENT_LIBS} glog device_context new_executor_defs)
nan_inf_utils
enforce
scope
glog
enforce
glog
scope
workqueue
interpretercore_event_garbage_collector
${DEVICE_EVENT_LIBS}
glog)
if(WITH_GPU OR WITH_ROCM)
cc_library(
interpretercore
SRCS interpretercore.cc
DEPS workqueue
${DEVICE_EVENT_LIBS}
interpretercore_util
interpretercore_event_garbage_collector
interpretercore_fast_garbage_collector
stream_analyzer
event_manager)
else()
cc_library(
interpretercore
SRCS interpretercore.cc
DEPS workqueue ${DEVICE_EVENT_LIBS} interpretercore_util
interpretercore_event_garbage_collector stream_analyzer event_manager)
set(STANDALONE_EXECUTOR_DEPS ${STANDALONE_EXECUTOR_DEPS}
interpretercore_fast_garbage_collector)
endif()
cc_library(
standalone_executor
SRCS standalone_executor.cc
DEPS interpretercore)
SRCS ${STANDALONE_EXECUTOR_SRCS}
DEPS ${STANDALONE_EXECUTOR_DEPS})
cc_library(
staticgraph_executor_statistics
SRCS executor_statistics.cc
DEPS enforce glog os_info)
# cc_binary(standalone_executor_test SRCS standalone_executor_test.cc DEPS interpretercore standalone_executor operator op_registry executor ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} profiler)
# skip win32 since wget is not installed by default on windows machine.
if(WITH_GPU
AND WITH_TESTING
......@@ -120,13 +103,7 @@ if(WITH_GPU
cc_test(
standalone_executor_test
SRCS standalone_executor_test.cc
DEPS interpretercore
standalone_executor
operator
op_registry
executor
${OPS}
${OP_DEPS})
DEPS standalone_executor operator op_registry executor ${OPS} ${OP_DEPS})
set_tests_properties(standalone_executor_test PROPERTIES TIMEOUT 100)
add_dependencies(standalone_executor_test download_program)
......
......@@ -5,7 +5,7 @@ cc_library(
cc_library(
var_helper
SRCS var_helper.cc
DEPS tensor phi_api)
DEPS tensor selected_rows)
if(WITH_XPU)
cc_library(
prepared_operator
......@@ -20,8 +20,8 @@ if(WITH_XPU)
op_kernel_type
data_transform
nan_inf_utils
phi_api
phi_utils
scalar
int_array
var_helper
profiler)
else()
......@@ -37,21 +37,16 @@ else()
op_kernel_type
data_transform
nan_inf_utils
phi_api
phi_utils
scalar
int_array
var_helper
profiler)
endif()
cc_library(
layer
SRCS layer.cc
DEPS prepared_operator
math_function
imperative_flag
variable_helper
op_registry
var_helper
phi_api)
DEPS prepared_operator math_function imperative_flag variable_helper
op_registry var_helper)
add_subdirectory(jit)
if(WITH_GPU)
cc_library(
......
......@@ -101,7 +101,7 @@ else()
cc_library(gather_scatter_kernel SRCS gather_scatter_kernel.cc gather_scatter_kernel.cu DEPS tensor)
endif()
set(OP_HEADER_DEPS ${OP_HEADER_DEPS} phi phi_api_utils gather_scatter_kernel)
set(OP_HEADER_DEPS ${OP_HEADER_DEPS} phi phi_api_utils gather_scatter_kernel backward_infermeta)
register_operators(EXCLUDES py_layer_op py_func_op warpctc_op dgc_op load_combine_op lstm_op run_program_op eye_op quantize_linear_op
recurrent_op save_combine_op sparse_attention_op sync_batch_norm_op spectral_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS})
......
......@@ -10,4 +10,4 @@ nv_library(
nv_test(
cudnn_helper_test
SRCS cudnn_helper_test.cc
DEPS dynload_cuda phi)
DEPS dynload_cuda)
pybind.h
op_function.cc
op_function1.cc
op_function2.cc
op_function3.cc
op_function4.cc
op_function5.cc
op_function6.cc
op_function7.cc
op_function8.cc
eager_op_function.cc
eager_final_state_op_function.cc
......@@ -102,13 +102,16 @@ endif()
set(PYBIND_SRCS
pybind.cc
imperative.cc
op_function.cc
inference_api.cc
ir.cc
bind_fleet_executor.cc
reader_py.cc
protobuf.cc
exception.cc
op_function_common.cc
parallel_executor.cc
tensor.cc
place.cc
const_value.cc
global_value_getter_setter.cc
fleet_wrapper_py.cc
......@@ -124,13 +127,15 @@ set(PYBIND_SRCS
generator_py.cc
communication.cc
cuda_streams_py.cc
jit.cc)
execute_process(
COMMAND
"${PYTHON_EXECUTABLE}"
"${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/generate_file_structures.py"
"${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/")
jit.cc
op_function1.cc
op_function2.cc
op_function3.cc
op_function4.cc
op_function5.cc
op_function6.cc
op_function7.cc
op_function8.cc)
if(WITH_CUSTOM_DEVICE)
set(PYBIND_DEPS ${PYBIND_DEPS} phi_capi)
......@@ -267,12 +272,35 @@ if(WITH_PYTHON)
target_link_libraries(kernel_signature_generator ${ROCM_HIPRTC_LIB})
endif()
set(impl_file ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/op_function.cc)
set(tmp_impl_file ${impl_file}.tmp)
set(op_function_output_path ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/)
set(impl_file1 ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/op_function1.cc)
set(tmp_impl_file1 ${impl_file1}.tmp)
set(impl_file2 ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/op_function2.cc)
set(tmp_impl_file2 ${impl_file2}.tmp)
set(impl_file3 ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/op_function3.cc)
set(tmp_impl_file3 ${impl_file3}.tmp)
set(impl_file4 ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/op_function4.cc)
set(tmp_impl_file4 ${impl_file4}.tmp)
set(impl_file5 ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/op_function5.cc)
set(tmp_impl_file5 ${impl_file5}.tmp)
set(impl_file6 ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/op_function6.cc)
set(tmp_impl_file6 ${impl_file6}.tmp)
set(impl_file7 ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/op_function7.cc)
set(tmp_impl_file7 ${impl_file7}.tmp)
set(impl_file8 ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/op_function8.cc)
set(tmp_impl_file8 ${impl_file8}.tmp)
set(CODE_GEN_SPLIT_FILE_COUNT "8")
set(eager_impl_file
${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/eager_op_function.cc)
set(tmp_eager_impl_file ${eager_impl_file}.tmp)
execute_process(
COMMAND
"${PYTHON_EXECUTABLE}"
"${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/generate_file_structures.py"
"${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/"
"${CODE_GEN_SPLIT_FILE_COUNT}")
set(OP_IMPL_DEPS op_function_generator)
set(EAGER_OP_IMPL_DEPS eager_op_function_generator
eager_final_state_python_c_codegen)
......@@ -292,7 +320,7 @@ if(WITH_PYTHON)
":retry\n"
"ECHO op_function_generator run %build_times% time\n"
"taskkill /f /im op_function_generator.exe 2>NUL\n"
"${op_impl_path}/op_function_generator.exe ${tmp_impl_file}\n"
"${op_impl_path}/op_function_generator.exe ${op_function_output_path} ${CODE_GEN_SPLIT_FILE_COUNT}\n"
"if %ERRORLEVEL% NEQ 0 (\n"
" set /a build_times=%build_times%+1\n"
" if %build_times% GEQ 10 (\n"
......@@ -367,12 +395,33 @@ if(WITH_PYTHON)
endif()
add_custom_command(
OUTPUT ${impl_file}
OUTPUT op_function
COMMAND
${CMAKE_BINARY_DIR}/paddle/fluid/pybind/op_function_generator_retry.bat
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file}
${impl_file}
COMMENT "copy_if_different ${tmp_impl_file} to ${impl_file}"
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file1}
${impl_file1}
COMMENT "copy_if_different ${tmp_impl_file1} to ${impl_file1}"
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file2}
${impl_file2}
COMMENT "copy_if_different ${tmp_impl_file2} to ${impl_file2}"
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file3}
${impl_file3}
COMMENT "copy_if_different ${tmp_impl_file3} to ${impl_file3}"
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file4}
${impl_file4}
COMMENT "copy_if_different ${tmp_impl_file4} to ${impl_file4}"
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file5}
${impl_file5}
COMMENT "copy_if_different ${tmp_impl_file5} to ${impl_file5}"
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file6}
${impl_file6}
COMMENT "copy_if_different ${tmp_impl_file6} to ${impl_file6}"
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file7}
${impl_file7}
COMMENT "copy_if_different ${tmp_impl_file7} to ${impl_file7}"
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file8}
${impl_file8}
COMMENT "copy_if_different ${tmp_impl_file8} to ${impl_file8}"
DEPENDS ${OP_IMPL_DEPS})
if(NOT ((NOT WITH_PYTHON) AND ON_INFER))
add_custom_command(
......@@ -431,13 +480,35 @@ if(WITH_PYTHON)
list(APPEND EAGER_OP_IMPL_DEPS ${CMAKE_CURRENT_BINARY_DIR}/libdnnl.so.0)
endif()
add_custom_command(
OUTPUT ${impl_file}
OUTPUT op_function
COMMAND
${CMAKE_COMMAND} -E env "LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:."
"${CMAKE_CURRENT_BINARY_DIR}/op_function_generator" "${tmp_impl_file}"
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file}
${impl_file}
COMMENT "copy_if_different ${tmp_impl_file} to ${impl_file}"
"${CMAKE_CURRENT_BINARY_DIR}/op_function_generator"
"${op_function_output_path}" "${CODE_GEN_SPLIT_FILE_COUNT}"
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file1}
${impl_file1}
COMMENT "copy_if_different ${tmp_impl_file1} to ${impl_file1}"
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file2}
${impl_file2}
COMMENT "copy_if_different ${tmp_impl_file2} to ${impl_file2}"
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file3}
${impl_file3}
COMMENT "copy_if_different ${tmp_impl_file3} to ${impl_file3}"
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file4}
${impl_file4}
COMMENT "copy_if_different ${tmp_impl_file4} to ${impl_file4}"
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file5}
${impl_file5}
COMMENT "copy_if_different ${tmp_impl_file5} to ${impl_file5}"
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file6}
${impl_file6}
COMMENT "copy_if_different ${tmp_impl_file6} to ${impl_file6}"
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file7}
${impl_file7}
COMMENT "copy_if_different ${tmp_impl_file7} to ${impl_file7}"
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file8}
${impl_file8}
COMMENT "copy_if_different ${tmp_impl_file8} to ${impl_file8}"
DEPENDS ${OP_IMPL_DEPS}
VERBATIM)
if(NOT ((NOT WITH_PYTHON) AND ON_INFER))
......@@ -454,19 +525,13 @@ if(WITH_PYTHON)
VERBATIM)
endif()
endif()
add_custom_target(op_function_generator_cmd ALL DEPENDS ${impl_file})
add_custom_target(op_function_generator_cmd ALL DEPENDS op_function)
if(NOT ((NOT WITH_PYTHON) AND ON_INFER))
add_custom_target(eager_op_function_generator_cmd ALL
DEPENDS ${eager_impl_file})
endif()
list(APPEND PYBIND_DEPS interpretercore standalone_executor
staticgraph_executor_statistics)
cc_library(
op_function_common
SRCS op_function_common.cc
DEPS ${PYBIND_DEPS})
list(APPEND PYBIND_DEPS op_function_common)
list(APPEND PYBIND_DEPS standalone_executor staticgraph_executor_statistics)
if(NOT ((NOT WITH_PYTHON) AND ON_INFER))
set(PYBIND_SRCS eager.cc ${PYBIND_SRCS})
......@@ -482,7 +547,6 @@ if(WITH_PYTHON)
list(APPEND PYBIND_DEPS backward)
list(APPEND PYBIND_DEPS grad_node_info)
list(APPEND PYBIND_DEPS phi)
list(APPEND PYBIND_DEPS op_function_common)
list(APPEND PYBIND_DEPS final_dygraph_function)
list(APPEND PYBIND_DEPS final_dygraph_node)
list(APPEND PYBIND_DEPS dygraph_function)
......
......@@ -16,12 +16,16 @@ import sys
import os
if __name__ == "__main__":
assert len(sys.argv) == 2
assert len(sys.argv) == 3
pybind_dir = sys.argv[1]
split_count = int(sys.argv[2])
empty_files = [os.path.join(pybind_dir, "eager_final_state_op_function.cc")]
empty_files.append(os.path.join(pybind_dir, "eager_op_function.cc"))
empty_files.append(os.path.join(pybind_dir, "op_function.cc"))
for i in range(split_count):
empty_files.append(
os.path.join(pybind_dir, "op_function" + str(i + 1) + ".cc"))
for path in empty_files:
if not os.path.exists(path):
......
......@@ -64,6 +64,7 @@ limitations under the License. */
namespace paddle {
namespace pybind {
std::atomic<int> VarBaseUniqueNameID{0};
PyTypeObject *g_varbase_pytype = nullptr;
namespace py = ::pybind11;
......@@ -497,7 +498,14 @@ static void VarBaseCopy(std::shared_ptr<imperative::VarBase> &src, // NOLINT
void BindImperative(py::module *m_ptr) {
auto &m = *m_ptr;
BindOpFunctions(&m);
BindOpFunctions1(&m);
BindOpFunctions2(&m);
BindOpFunctions3(&m);
BindOpFunctions4(&m);
BindOpFunctions5(&m);
BindOpFunctions6(&m);
BindOpFunctions7(&m);
BindOpFunctions8(&m);
#ifndef _WIN32
// Dygraph DataLoader signal handler
......
......@@ -257,7 +257,14 @@ PyObject* MakeReturnPyObject(const std::tuple<Args...>& out) {
return result;
}
void BindOpFunctions(pybind11::module* module);
// Registration entry points for the generated op bindings. The generator
// writes one `BindOpFunctions<N>` definition per split output file
// (op_function<N>.cc), so there is one declaration here per split file.
// NOTE(review): the number of declarations appears to have to match the
// split count passed to the generator (CODE_GEN_SPLIT_FILE_COUNT in the
// pybind CMake configuration) — confirm when changing either side.
void BindOpFunctions1(pybind11::module* module);
void BindOpFunctions2(pybind11::module* module);
void BindOpFunctions3(pybind11::module* module);
void BindOpFunctions4(pybind11::module* module);
void BindOpFunctions5(pybind11::module* module);
void BindOpFunctions6(pybind11::module* module);
void BindOpFunctions7(pybind11::module* module);
void BindOpFunctions8(pybind11::module* module);
} // namespace pybind
} // namespace paddle
......@@ -422,13 +422,17 @@ std::string GenerateOpFunctionsBody(
return op_function_str;
}
static std::tuple<std::vector<std::string>, std::vector<std::string>>
GenerateOpFunctions() {
static std::vector<
std::tuple<std::vector<std::string>, std::vector<std::string>>>
GenerateOpFunctions(int split_count) {
auto& op_info_map = paddle::framework::OpInfoMap::Instance().map();
std::vector<std::tuple<std::vector<std::string>, std::vector<std::string>>>
result;
std::vector<std::string> op_function_list, bind_function_list;
auto& all_kernels = paddle::framework::OperatorWithKernel::AllOpKernels();
paddle::flat_hash_map<std::string, paddle::framework::OpInfo>
op_info_map_need_gen;
for (auto& pair : op_info_map) {
auto& op_info = pair.second;
auto op_proto = op_info.proto_;
......@@ -444,6 +448,22 @@ GenerateOpFunctions() {
continue;
}
op_info_map_need_gen.emplace(pair);
}
int cc_file_api_size = op_info_map_need_gen.size() / split_count;
if (op_info_map_need_gen.size() % split_count != 0) {
cc_file_api_size++;
}
int api_index = 0;
int file_index = 0;
for (auto& pair : op_info_map_need_gen) {
auto& op_info = pair.second;
auto op_proto = op_info.proto_;
auto& op_type = op_proto->type();
// NOTE(pangyoki): Inplace Strategy.
// In this case, output will reuse input varbase.
// Dygraph mode needs to be aligned with the in-place strategy in static
......@@ -489,13 +509,24 @@ GenerateOpFunctions() {
op_function_list.emplace_back(std::move(inplace_op_function_str));
bind_function_list.emplace_back(std::move(inplace_bind_function_str));
}
api_index++;
if (api_index / cc_file_api_size > file_index) {
file_index++;
result.push_back(std::make_tuple(op_function_list, bind_function_list));
op_function_list.clear();
bind_function_list.clear();
}
}
return std::make_tuple(op_function_list, bind_function_list);
result.push_back(std::make_tuple(op_function_list, bind_function_list));
return result;
}
int main(int argc, char* argv[]) {
if (argc != 2) {
std::cerr << "argc must be 2" << std::endl;
if (argc != 3) {
std::cerr << "argc must be 3" << std::endl;
return -1;
}
......@@ -513,39 +544,45 @@ int main(int argc, char* argv[]) {
"\"paddle/fluid/pybind/op_function.h\"",
"<Python.h>"};
std::ofstream out(argv[1], std::ios::out);
std::string path = argv[1];
int split_count = atoi(argv[2]);
for (auto& header : headers) {
out << "#include " + header + "\n";
}
auto op_funcs = GenerateOpFunctions(split_count);
out << "\n\n";
auto op_funcs = GenerateOpFunctions();
out << "namespace paddle {\n"
<< "namespace pybind {\n\n";
out << "std::atomic<int> VarBaseUniqueNameID{0};\n";
out << paddle::string::join_strings(std::get<0>(op_funcs), '\n');
out << "\n\n";
out << "static PyMethodDef ExtestMethods[] = {\n"
<< paddle::string::join_strings(std::get<1>(op_funcs), '\n')
<< "\n {nullptr,nullptr,0,nullptr}"
<< "};\n\n";
out << "void BindOpFunctions(pybind11::module *module) {\n"
<< " auto m = module->def_submodule(\"ops\");\n"
<< " if (PyModule_AddFunctions(m.ptr(), ExtestMethods) < 0) {\n"
<< " PADDLE_THROW(platform::errors::Fatal (\"Add functions to "
"core.ops failed!\"));\n"
<< " }\n\n"
<< " InitOpsAttrTypeMap();"
<< "}\n\n"
<< "} // namespace pybind\n"
<< "} // namespace paddle\n";
out.close();
for (size_t i = 0; i < op_funcs.size(); i++) {
std::ofstream out(path + "op_function" + std::to_string(i + 1) + ".cc.tmp",
std::ios::out);
for (auto& header : headers) {
out << "#include " + header + "\n";
}
out << "\n\n";
out << "namespace paddle {\n"
<< "namespace pybind {\n\n";
out << "extern std::atomic<int> VarBaseUniqueNameID;\n";
out << paddle::string::join_strings(std::get<0>(op_funcs[i]), '\n');
out << "\n\n";
out << "static PyMethodDef ExtestMethods[] = {\n"
<< paddle::string::join_strings(std::get<1>(op_funcs[i]), '\n')
<< "\n {nullptr,nullptr,0,nullptr}"
<< "};\n\n";
out << "void BindOpFunctions" << i + 1 << "(pybind11::module *module) {\n"
<< " auto m = module->def_submodule(\"ops\");\n"
<< " if (PyModule_AddFunctions(m.ptr(), ExtestMethods) < 0) {\n"
<< " PADDLE_THROW(platform::errors::Fatal (\"Add functions to "
"core.ops failed!\"));\n"
<< " }\n\n"
<< " InitOpsAttrTypeMap();"
<< "}\n\n"
<< "} // namespace pybind\n"
<< "} // namespace paddle\n";
out.close();
}
#ifdef PADDLE_WITH_ASCEND_CL
ge::GEFinalize();
......
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Copyright (c) 2022 NVIDIA Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <Python.h>
#include <algorithm>
#include <cctype>
#include <cstdlib>
#include <iterator>
#include <map>
#include <memory>
#include <mutex> // NOLINT // for call_once
#include <string>
#include <tuple>
#include <type_traits>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/framework/custom_operator.h"
#include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/framework/data_type_transform.h"
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/executor_cache.h"
#include "paddle/fluid/framework/executor_gc_helper.h"
#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/garbage_collector.h"
#include "paddle/fluid/framework/io/fs.h"
#include "paddle/fluid/framework/ir/coalesce_grad_tensor_pass.h"
#include "paddle/fluid/framework/ir/cost_model.h"
#include "paddle/fluid/framework/ir/generate_pass.h"
#include "paddle/fluid/framework/ir/pass_builder.h"
#include "paddle/fluid/framework/lod_rank_table.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/new_executor/executor_statistics.h"
#include "paddle/fluid/framework/new_executor/standalone_executor.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/framework/parallel_executor.h"
#include "paddle/fluid/framework/phi_utils.h"
#include "paddle/fluid/framework/prune.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/save_load_util.h"
#include "paddle/fluid/framework/scope_pool.h"
#include "paddle/fluid/framework/selected_rows_utils.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/framework/trainer.h"
#include "paddle/fluid/framework/type_defs.h"
#include "paddle/fluid/framework/version.h"
#include "paddle/fluid/imperative/amp_auto_cast.h"
#include "paddle/fluid/imperative/layer.h"
#include "paddle/fluid/memory/allocation/allocator_strategy.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/memory/allocation/cuda_ipc_allocator.h"
#endif
#include "paddle/fluid/memory/allocation/mmap_allocator.h"
#include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/common_infer_shape_functions.h"
#include "paddle/fluid/operators/py_func_op.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/init.h"
#include "paddle/fluid/platform/monitor.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/profiler/event_python.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/fluid/platform/profiler/profiler.h"
#include "paddle/fluid/pybind/cuda_streams_py.h"
#include "paddle/fluid/pybind/distributed_py.h"
#include "paddle/fluid/pybind/eager.h"
#include "paddle/fluid/pybind/imperative.h"
#include "paddle/fluid/pybind/io.h"
#include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/lod_utils.h"
#include "paddle/utils/none.h"
#ifdef PADDLE_WITH_ASCEND
#include "paddle/fluid/pybind/ascend_wrapper_py.h"
#endif
#include "paddle/fluid/pybind/bind_cost_model.h"
#include "paddle/fluid/pybind/bind_fleet_executor.h"
#include "paddle/fluid/pybind/box_helper_py.h"
#include "paddle/fluid/pybind/communication.h"
#include "paddle/fluid/pybind/compatible.h"
#include "paddle/fluid/pybind/const_value.h"
#include "paddle/fluid/pybind/data_set_py.h"
#include "paddle/fluid/pybind/exception.h"
#include "paddle/fluid/pybind/fleet_wrapper_py.h"
#include "paddle/fluid/pybind/generator_py.h"
#include "paddle/fluid/pybind/global_value_getter_setter.h"
#include "paddle/fluid/pybind/gloo_context_py.h"
#include "paddle/fluid/pybind/gloo_wrapper_py.h"
#include "paddle/fluid/pybind/heter_wrapper_py.h"
#include "paddle/fluid/pybind/inference_api.h"
#include "paddle/fluid/pybind/ir.h"
#include "paddle/fluid/pybind/metrics_py.h"
#include "paddle/fluid/pybind/ps_gpu_wrapper_py.h"
#include "paddle/fluid/pybind/pybind_boost_headers.h"
#include "paddle/phi/backends/device_manager.h"
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/pybind/nccl_wrapper_py.h"
#endif
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/pybind/protobuf.h"
#include "paddle/fluid/pybind/pybind.h" // NOLINT
#include "paddle/fluid/pybind/reader_py.h"
#include "paddle/fluid/pybind/tensor_py.h"
#include "paddle/fluid/string/to_string.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
#endif
#ifndef PADDLE_WITH_HIP
#include "paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h"
#endif
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#endif
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/npu/npu_info.h"
#endif
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/device/xpu/xpu_info.h"
#include "paddle/fluid/platform/device/xpu/xpu_op_list.h"
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
#include "paddle/phi/capi/capi.h"
#endif
#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
#ifdef PADDLE_WITH_IPU
#include "paddle/fluid/platform/device/ipu/ipu_backend.h"
#include "paddle/fluid/platform/device/ipu/ipu_info.h"
#endif
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
#ifdef PADDLE_WITH_CRYPTO
#include "paddle/fluid/pybind/crypto.h"
#endif
#if defined PADDLE_WITH_PSCORE
#include "paddle/fluid/pybind/fleet_py.h"
#endif
#ifdef PADDLE_WITH_CINN
#include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h"
#endif
#include "paddle/fluid/eager/api/utils/global_utils.h"
#include "paddle/fluid/imperative/layout_autotune.h"
#include "paddle/fluid/pybind/eager_utils.h"
#include "paddle/fluid/pybind/parallel_executor.h"
#include "paddle/phi/api/ext/op_meta_info.h"
#include "paddle/phi/kernels/autotune/cache.h"
#include "paddle/phi/kernels/autotune/switch_autotune.h"
#include "pybind11/stl.h"
// Declares the boolean flag `use_mkldnn` for this translation unit.
// NOTE(review): presumably a gflags-style declaration whose definition lives
// in another file -- confirm against the flag's definition site.
DECLARE_bool(use_mkldnn);
// disable auto conversion to list in Python
// The four container types below are made opaque to pybind11, so they cross
// the C++/Python boundary as bound objects instead of being copied into
// native Python lists by the STL casters pulled in through "pybind11/stl.h".
PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray);
PYBIND11_MAKE_OPAQUE(paddle::framework::FetchUnmergedList);
PYBIND11_MAKE_OPAQUE(paddle::framework::FetchList);
PYBIND11_MAKE_OPAQUE(paddle::framework::FetchType);
namespace paddle {
namespace pybind {
using namespace paddle::framework; // NOLINT
void BindParallelExecutor(pybind11::module &m) { // NOLINT
// -- python binds for parallel executor.
// Handle for the Python class "ParallelExecutor", registered on module `m`.
// Its constructor and methods are attached later in this function through
// `pe.def(...)`.
py::class_<ParallelExecutor> pe(m, "ParallelExecutor");
// Handle for the Python class "ExecutionStrategy". It is registered with `pe`
// as its scope (not with `m`), so it becomes an attribute of the
// ParallelExecutor class. The raw string below is the class docstring shown
// to Python users; its text is runtime data and must not be edited casually.
py::class_<ExecutionStrategy> exec_strategy(pe, "ExecutionStrategy", R"DOC(
ExecutionStrategy allows the user to more preciously control how to run
the program in ParallelExecutor by setting the property.
Returns:
ExecutionStrategy: An ExecutionStrategy object.
Examples:
.. code-block:: python
import paddle
import paddle.static as static
import paddle.nn.functional as F
paddle.enable_static()
x = static.data(name='x', shape=[None, 13], dtype='float32')
y = static.data(name='y', shape=[None, 1], dtype='float32')
y_predict = static.nn.fc(input=x, size=1, act=None)
cost = F.square_error_cost(input=y_predict, label=y)
avg_loss = paddle.mean(cost)
sgd_optimizer = paddle.optimizer.SGD(learning_rate=0.001)
sgd_optimizer.minimize(avg_loss)
exec_strategy = static.ExecutionStrategy()
exec_strategy.num_threads = 4
train_exe = static.ParallelExecutor(use_cuda=False,
loss_name=avg_loss.name,
exec_strategy=exec_strategy)
)DOC");
py::enum_<paddle::platform::DeviceType>(m, "DeviceType", py::arithmetic())
.value("CPU", paddle::platform::DeviceType::CPU)
.value("CUDA", paddle::platform::DeviceType::CUDA)
.value("XPU", paddle::platform::DeviceType::XPU);
// Constructor and read/write properties of ExecutionStrategy. Every property
// is a plain getter/setter pair over one data member; the raw strings are the
// Python docstrings.
exec_strategy.def(py::init());

// num_threads <-> ExecutionStrategy::num_threads_
exec_strategy.def_property(
    "num_threads",
    [](const ExecutionStrategy &strategy) { return strategy.num_threads_; },
    [](ExecutionStrategy &strategy, size_t value) {
      strategy.num_threads_ = value;
    },
    R"DOC(
The type is INT, num_threads represents the size of thread pool that
used to run the operators of the current program in ParallelExecutor.
If :math:`num\_threads=1`, all the operators will execute one by one,
but the order maybe difference between iterations.
If it is not set, it will be set in ParallelExecutor according to the
device type and device count, for GPU, :math:`num\_threads=device\_count*4`, for CPU,
:math:`num\_threads=CPU\_NUM*4`, the explanation of:math:`CPU\_NUM` is in ParallelExecutor.
if it is not set, ParallelExecutor will get the cpu count by calling
`multiprocessing.cpu_count()`. Default 0.
Examples:
.. code-block:: python
import paddle
import paddle.static as static
paddle.enable_static()
exec_strategy = static.ExecutionStrategy()
exec_strategy.num_threads = 4
)DOC");

// _use_device <-> ExecutionStrategy::use_device_
// NOTE(liuyuhui): No docstring is attached to '_use_device' on purpose,
// because use_device isn't exposed to users.
exec_strategy.def_property(
    "_use_device",
    [](const ExecutionStrategy &strategy) { return strategy.use_device_; },
    [](ExecutionStrategy &strategy, paddle::platform::DeviceType device) {
      strategy.use_device_ = device;
    });

// allow_op_delay <-> ExecutionStrategy::allow_op_delay_
exec_strategy.def_property(
    "allow_op_delay",
    [](const ExecutionStrategy &strategy) { return strategy.allow_op_delay_; },
    [](ExecutionStrategy &strategy, bool value) {
      strategy.allow_op_delay_ = value;
    },
    R"DOC(The type is BOOL, allow_op_delay represents whether to delay the
communication operators to run, it may make the execution faster.
Note that this option is invalid now, and it will be removed in
next version. Default False.)DOC");

// num_iteration_per_drop_scope <->
// ExecutionStrategy::num_iteration_per_drop_scope_
exec_strategy.def_property(
    "num_iteration_per_drop_scope",
    [](const ExecutionStrategy &strategy) {
      return strategy.num_iteration_per_drop_scope_;
    },
    [](ExecutionStrategy &strategy, size_t value) {
      strategy.num_iteration_per_drop_scope_ = value;
    },
    R"DOC(The type is INT, num_iteration_per_drop_scope indicates how
many iterations to clean up the temp variables which
is generated during execution. It may make the execution faster,
because the temp variable's shape maybe the same between two iterations.
Default 100.
.. note::
1. If you fetch data when calling the 'run', the ParallelExecutor
will clean up the temp variables at the end of the current iteration.
2. In some NLP model, it may cause the GPU memory is insufficient,
in this case, you should reduce `num_iteration_per_drop_scope`.
Examples:
.. code-block:: python
import paddle
import paddle.static as static
paddle.enable_static()
exec_strategy = static.ExecutionStrategy()
exec_strategy.num_iteration_per_drop_scope = 10
)DOC");

// num_iteration_per_run <-> ExecutionStrategy::num_iteration_per_run_
exec_strategy.def_property(
    "num_iteration_per_run",
    [](const ExecutionStrategy &strategy) {
      return strategy.num_iteration_per_run_;
    },
    [](ExecutionStrategy &strategy, size_t value) {
      strategy.num_iteration_per_run_ = value;
    },
    R"DOC(This config that how many iteration the executor will run when
user call exe.run() in python。Default: 1.
Examples:
.. code-block:: python
import paddle
import paddle.static as static
paddle.enable_static()
exec_strategy = static.ExecutionStrategy()
exec_strategy.num_iteration_per_run = 10
)DOC");

// use_thread_barrier <-> ExecutionStrategy::thread_barrier_
exec_strategy.def_property(
    "use_thread_barrier",
    [](const ExecutionStrategy &strategy) { return strategy.thread_barrier_; },
    [](ExecutionStrategy &strategy, bool value) {
      strategy.thread_barrier_ = value;
    },
    R"DOC(This config that the this is distributed training with parameter server
)DOC");

// _dry_run <-> ExecutionStrategy::dry_run_ (no docstring attached)
exec_strategy.def_property(
    "_dry_run",
    [](const ExecutionStrategy &strategy) { return strategy.dry_run_; },
    [](ExecutionStrategy &strategy, bool value) {
      strategy.dry_run_ = value;
    });
// Boolean view over ExecutionStrategy::type_: reads as True only when the
// type is kExperimental; writing True/False selects kExperimental/kDefault.
exec_strategy.def_property(
    "use_experimental_executor",
    [](const ExecutionStrategy &strategy) {
      return strategy.type_ == ExecutionStrategy::kExperimental;
    },
    [](ExecutionStrategy &strategy, bool enable) {
      if (enable) {
        strategy.type_ = ExecutionStrategy::kExperimental;
      } else {
        strategy.type_ = ExecutionStrategy::kDefault;
      }
    });
// Handle for the Python class "BuildStrategy". It is registered with `pe` as
// its scope, so it becomes an attribute of the ParallelExecutor class. The
// nested enums and the properties are attached to this handle by the
// statements that follow. The raw string is the class docstring shown to
// Python users.
py::class_<BuildStrategy> build_strategy(pe, "BuildStrategy", R"DOC(
BuildStrategy allows the user to more preciously control how to
build the SSA Graph in ParallelExecutor by setting the property.
Returns:
BuildStrategy: An BuildStrategy object.
Examples:
.. code-block:: python
import os
import paddle
import paddle.static as static
paddle.enable_static()
os.environ['CPU_NUM'] = str(2)
places = static.cpu_places()
data = static.data(name="x", shape=[None, 1], dtype="float32")
hidden = static.nn.fc(input=data, size=10)
loss = paddle.mean(hidden)
paddle.optimizer.SGD(learning_rate=0.01).minimize(loss)
build_strategy = static.BuildStrategy()
build_strategy.enable_inplace = True
build_strategy.memory_optimize = True
build_strategy.reduce_strategy = static.BuildStrategy.ReduceStrategy.Reduce
program = static.CompiledProgram(static.default_main_program())
program = program.with_data_parallel(loss_name=loss.name,
build_strategy=build_strategy,
places=places)
)DOC");
py::enum_<BuildStrategy::ReduceStrategy>(build_strategy, "ReduceStrategy")
.value("Reduce", BuildStrategy::ReduceStrategy::kReduce)
.value("AllReduce", BuildStrategy::ReduceStrategy::kAllReduce)
.value("_NoReduce", BuildStrategy::ReduceStrategy::kNoReduce);
py::enum_<BuildStrategy::GradientScaleStrategy>(build_strategy,
"GradientScaleStrategy")
.value("CoeffNumDevice",
BuildStrategy::GradientScaleStrategy::kCoeffNumDevice)
.value("One", BuildStrategy::GradientScaleStrategy::kOne)
.value("Customized", BuildStrategy::GradientScaleStrategy::kCustomized);
// Constructor, helpers and properties of BuildStrategy.
//
// Most properties are getter/setter pairs over a single data member. Setters
// that contain a PADDLE_ENFORCE_NE(self.IsFinalized(), true, ...) check raise
// PreconditionNotMet once the strategy has been finalized; the remaining
// setters write unconditionally. The raw strings are the Python docstrings.
build_strategy.def(py::init())
    // Exposes BuildStrategy::ClearFinalized to Python.
    .def("_clear_finalized", &BuildStrategy::ClearFinalized)
    // reduce_strategy <-> BuildStrategy::reduce_ (rejected after finalize).
    .def_property(
        "reduce_strategy",
        [](const BuildStrategy &self) { return self.reduce_; },
        [](BuildStrategy &self, BuildStrategy::ReduceStrategy strategy) {
          PADDLE_ENFORCE_NE(self.IsFinalized(),
                            true,
                            platform::errors::PreconditionNotMet(
                                "BuildStrategy has been finalized, cannot be "
                                "configured again."));
          self.reduce_ = strategy;
        },
        R"DOC((fluid.BuildStrategy.ReduceStrategy, optional): there are two reduce
strategies in ParallelExecutor, AllReduce and Reduce. If you want
that all the parameters' optimization are done on all devices independently,
you should choose AllReduce; otherwise, if you choose Reduce, all the parameters'
optimization will be evenly distributed to different devices, and then
broadcast the optimized parameter to other devices.
Default is 'AllReduce'.
Examples:
.. code-block:: python
import paddle
import paddle.static as static
paddle.enable_static()
build_strategy = static.BuildStrategy()
build_strategy.reduce_strategy = static.BuildStrategy.ReduceStrategy.Reduce
)DOC")
    // gradient_scale_strategy <-> BuildStrategy::gradient_scale_
    // (rejected after finalize).
    .def_property(
        "gradient_scale_strategy",
        [](const BuildStrategy &self) { return self.gradient_scale_; },
        [](BuildStrategy &self,
           BuildStrategy::GradientScaleStrategy strategy) {
          PADDLE_ENFORCE_NE(self.IsFinalized(),
                            true,
                            platform::errors::PreconditionNotMet(
                                "BuildStrategy has been finalized, cannot be "
                                "configured again."));
          self.gradient_scale_ = strategy;
        },
        R"DOC((paddle.static.BuildStrategy.GradientScaleStrategy, optional): there are three
ways of defining :math:`loss@grad` in ParallelExecutor, that is, CoeffNumDevice,
One and Customized. By default, ParallelExecutor sets the :math:`loss@grad`
according to the number of devices. If you want to customize :math:`loss@grad`,
you can choose Customized. Default is 'CoeffNumDevice'.
Examples:
.. code-block:: python
import numpy
import os
import paddle
import paddle.static as static
paddle.enable_static()
use_cuda = True
place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
exe = static.Executor(place)
# NOTE: If you use CPU to run the program, you need
# to specify the CPU_NUM, otherwise, paddle will use
# all the number of the logic core as the CPU_NUM,
# in that case, the batch size of the input should be
# greater than CPU_NUM, if not, the process will be
# failed by an exception.
if not use_cuda:
os.environ['CPU_NUM'] = str(2)
places = static.cpu_places()
else:
places = static.cuda_places()
data = static.data(name='X', shape=[None, 1], dtype='float32')
hidden = static.nn.fc(input=data, size=10)
loss = paddle.mean(hidden)
paddle.optimizer.SGD(learning_rate=0.01).minimize(loss)
exe.run(static.default_startup_program())
build_strategy = static.BuildStrategy()
build_strategy.gradient_scale_strategy = \
static.BuildStrategy.GradientScaleStrategy.Customized
compiled_prog = static.CompiledProgram(
static.default_main_program()).with_data_parallel(
loss_name=loss.name, build_strategy=build_strategy,
places=places)
dev_count = len(places)
x = numpy.random.random(size=(10, 1)).astype('float32')
loss_grad = numpy.ones((dev_count)).astype("float32") * 0.01
loss_grad_name = loss.name+"@GRAD"
loss_data = exe.run(compiled_prog,
feed={"X": x, loss_grad_name : loss_grad},
fetch_list=[loss.name, loss_grad_name])
)DOC")
    // debug_graphviz_path <-> BuildStrategy::debug_graphviz_path_
    // (rejected after finalize).
    .def_property(
        "debug_graphviz_path",
        [](const BuildStrategy &self) { return self.debug_graphviz_path_; },
        [](BuildStrategy &self, const std::string &path) {
          PADDLE_ENFORCE_NE(self.IsFinalized(),
                            true,
                            platform::errors::PreconditionNotMet(
                                "BuildStrategy has been finalized, cannot be "
                                "configured again."));
          self.debug_graphviz_path_ = path;
        },
        R"DOC((str, optional): debug_graphviz_path indicates the path that
writing the SSA Graph to file in the form of graphviz.
It is useful for debugging. Default is empty string, that is, ""
Examples:
.. code-block:: python
import paddle
import paddle.static as static
paddle.enable_static()
build_strategy = static.BuildStrategy()
build_strategy.debug_graphviz_path = "./graph"
)DOC")
    // enable_sequential_execution <->
    // BuildStrategy::enable_sequential_execution_ (rejected after finalize).
    .def_property(
        "enable_sequential_execution",
        [](const BuildStrategy &self) {
          return self.enable_sequential_execution_;
        },
        [](BuildStrategy &self, bool b) {
          PADDLE_ENFORCE_NE(self.IsFinalized(),
                            true,
                            platform::errors::PreconditionNotMet(
                                "BuildStrategy has been finalized, cannot be "
                                "configured again."));
          self.enable_sequential_execution_ = b;
        },
        R"DOC((bool, optional): If set True, the execution order of ops would
be the same as what is in the program. Default is False.
Examples:
.. code-block:: python
import paddle
import paddle.static as static
paddle.enable_static()
build_strategy = static.BuildStrategy()
build_strategy.enable_sequential_execution = True
)DOC")
    // remove_unnecessary_lock <-> BuildStrategy::remove_unnecessary_lock_
    // (rejected after finalize).
    .def_property(
        "remove_unnecessary_lock",
        [](const BuildStrategy &self) {
          return self.remove_unnecessary_lock_;
        },
        [](BuildStrategy &self, bool b) {
          PADDLE_ENFORCE_NE(self.IsFinalized(),
                            true,
                            platform::errors::PreconditionNotMet(
                                "BuildStrategy has been finalized, cannot be "
                                "configured again."));
          self.remove_unnecessary_lock_ = b;
        },
        R"DOC((bool, optional): If set True, some locks in GPU ops would be
released and ParallelExecutor would run faster. Default is True.
Examples:
.. code-block:: python
import paddle
import paddle.static as static
paddle.enable_static()
build_strategy = static.BuildStrategy()
build_strategy.remove_unnecessary_lock = True
)DOC")
    // num_trainers <-> BuildStrategy::num_trainers_. On WIN32 every write
    // throws Unavailable; the assignment is compiled only for other
    // platforms so no unreachable statement is left behind the throw.
    .def_property(
        "num_trainers",
        [](const BuildStrategy &self) { return self.num_trainers_; },
        [](BuildStrategy &self, int num_trainers) {
#ifdef WIN32
          PADDLE_THROW(platform::errors::Unavailable(
              "Distribution mode is not supported on Windows platform."));
#else
          self.num_trainers_ = num_trainers;
#endif
        })
    // trainers_endpoints <-> BuildStrategy::trainers_endpoints_
    .def_property(
        "trainers_endpoints",
        [](const BuildStrategy &self) { return self.trainers_endpoints_; },
        [](BuildStrategy &self,
           const std::vector<std::string> &trainers_endpoints) {
          self.trainers_endpoints_ = trainers_endpoints;
        })
    // trainer_id <-> BuildStrategy::trainer_id_
    .def_property(
        "trainer_id",
        [](const BuildStrategy &self) { return self.trainer_id_; },
        [](BuildStrategy &self, int trainer_id) {
          self.trainer_id_ = trainer_id;
        })
    // nccl_comm_num <-> BuildStrategy::nccl_comm_num_
    .def_property(
        "nccl_comm_num",
        [](const BuildStrategy &self) { return self.nccl_comm_num_; },
        [](BuildStrategy &self, int nccl_comm_num) {
          self.nccl_comm_num_ = nccl_comm_num;
        })
    // bkcl_comm_num <-> BuildStrategy::bkcl_comm_num_
    .def_property(
        "bkcl_comm_num",
        [](const BuildStrategy &self) { return self.bkcl_comm_num_; },
        [](BuildStrategy &self, int bkcl_comm_num) {
          self.bkcl_comm_num_ = bkcl_comm_num;
        })
    // use_hierarchical_allreduce <->
    // BuildStrategy::use_hierarchical_allreduce_
    .def_property(
        "use_hierarchical_allreduce",
        [](const BuildStrategy &self) {
          return self.use_hierarchical_allreduce_;
        },
        [](BuildStrategy &self, bool use) {
          self.use_hierarchical_allreduce_ = use;
        })
    // hierarchical_allreduce_inter_nranks <->
    // BuildStrategy::hierarchical_allreduce_inter_nranks_
    .def_property(
        "hierarchical_allreduce_inter_nranks",
        [](const BuildStrategy &self) {
          return self.hierarchical_allreduce_inter_nranks_;
        },
        [](BuildStrategy &self, int nranks) {
          self.hierarchical_allreduce_inter_nranks_ = nranks;
        })
    // fuse_elewise_add_act_ops <-> BuildStrategy::fuse_elewise_add_act_ops_
    // (rejected after finalize).
    .def_property(
        "fuse_elewise_add_act_ops",
        [](const BuildStrategy &self) {
          return self.fuse_elewise_add_act_ops_;
        },
        [](BuildStrategy &self, bool b) {
          PADDLE_ENFORCE_NE(self.IsFinalized(),
                            true,
                            platform::errors::PreconditionNotMet(
                                "BuildStrategy has been finalized, cannot be "
                                "configured again."));
          self.fuse_elewise_add_act_ops_ = b;
        },
        R"DOC((bool, optional): fuse_elewise_add_act_ops indicate whether
to fuse elementwise_add_op and activation_op,
it may make the execution faster. Default is False.
Examples:
.. code-block:: python
import paddle
import paddle.static as static
paddle.enable_static()
build_strategy = static.BuildStrategy()
build_strategy.fuse_elewise_add_act_ops = True
)DOC")
    // fuse_gemm_epilogue <-> BuildStrategy::fuse_gemm_epilogue_
    // (rejected after finalize).
    .def_property(
        "fuse_gemm_epilogue",
        [](const BuildStrategy &self) { return self.fuse_gemm_epilogue_; },
        [](BuildStrategy &self, bool b) {
          PADDLE_ENFORCE_NE(self.IsFinalized(),
                            true,
                            platform::errors::PreconditionNotMet(
                                "BuildStrategy has been finalized, cannot be "
                                "configured again."));
          self.fuse_gemm_epilogue_ = b;
        },
        R"DOC((bool, optional): fuse_gemm_epilogue indicate whether
to fuse matmul_op, elementwise_add_op and activation_op,
it may make the execution faster. Default is False.
Examples:
.. code-block:: python
import paddle
import paddle.static as static
paddle.enable_static()
build_strategy = static.BuildStrategy()
build_strategy.fuse_gemm_epilogue = True
)DOC")
    // fuse_bn_act_ops <-> BuildStrategy::fuse_bn_act_ops_
    // (rejected after finalize).
    .def_property(
        "fuse_bn_act_ops",
        [](const BuildStrategy &self) { return self.fuse_bn_act_ops_; },
        [](BuildStrategy &self, bool b) {
          PADDLE_ENFORCE_NE(self.IsFinalized(),
                            true,
                            platform::errors::PreconditionNotMet(
                                "BuildStrategy has been finalized, cannot be "
                                "configured again."));
          self.fuse_bn_act_ops_ = b;
        },
        R"DOC((bool, optional): fuse_bn_act_ops indicate whether
to fuse batch_norm and activation_op,
it may make the execution faster. Default is False.
Examples:
.. code-block:: python
import paddle
import paddle.static as static
paddle.enable_static()
build_strategy = static.BuildStrategy()
build_strategy.fuse_bn_act_ops = True
)DOC")
    // fuse_bn_add_act_ops <-> BuildStrategy::fuse_bn_add_act_ops_
    // (rejected after finalize).
    .def_property(
        "fuse_bn_add_act_ops",
        [](const BuildStrategy &self) { return self.fuse_bn_add_act_ops_; },
        [](BuildStrategy &self, bool b) {
          PADDLE_ENFORCE_NE(self.IsFinalized(),
                            true,
                            platform::errors::PreconditionNotMet(
                                "BuildStrategy has been finalized, cannot be "
                                "configured again."));
          self.fuse_bn_add_act_ops_ = b;
        },
        R"DOC((bool, optional): fuse_bn_add_act_ops indicate whether
to fuse batch_norm, elementwise_add and activation_op,
it may make the execution faster. Default is True
Examples:
.. code-block:: python
import paddle
import paddle.static as static
paddle.enable_static()
build_strategy = static.BuildStrategy()
build_strategy.fuse_bn_add_act_ops = True
)DOC")
    // enable_auto_fusion <-> BuildStrategy::enable_auto_fusion_
    // (rejected after finalize).
    .def_property(
        "enable_auto_fusion",
        [](const BuildStrategy &self) { return self.enable_auto_fusion_; },
        [](BuildStrategy &self, bool b) {
          PADDLE_ENFORCE_NE(self.IsFinalized(),
                            true,
                            platform::errors::PreconditionNotMet(
                                "BuildStrategy has been finalized, cannot be "
                                "configured again."));
          self.enable_auto_fusion_ = b;
        },
        R"DOC((bool, optional): Whether to enable fusing subgraph to a
fusion_group. Now we only support fusing subgraph that composed
of elementwise-like operators, such as elementwise_add/mul
without broadcast and activations.
Examples:
.. code-block:: python
import paddle
import paddle.static as static
paddle.enable_static()
build_strategy = static.BuildStrategy()
build_strategy.enable_auto_fusion = True
)DOC")
    // fuse_relu_depthwise_conv <-> BuildStrategy::fuse_relu_depthwise_conv_
    // (rejected after finalize).
    .def_property(
        "fuse_relu_depthwise_conv",
        [](const BuildStrategy &self) {
          return self.fuse_relu_depthwise_conv_;
        },
        [](BuildStrategy &self, bool b) {
          PADDLE_ENFORCE_NE(self.IsFinalized(),
                            true,
                            platform::errors::PreconditionNotMet(
                                "BuildStrategy has been finalized, cannot be "
                                "configured again."));
          self.fuse_relu_depthwise_conv_ = b;
        },
        R"DOC((bool, optional): fuse_relu_depthwise_conv indicate whether
to fuse relu and depthwise_conv2d,
it will save GPU memory and may make the execution faster.
This options is only available in GPU devices.
Default is False.
Examples:
.. code-block:: python
import paddle
import paddle.static as static
paddle.enable_static()
build_strategy = static.BuildStrategy()
build_strategy.fuse_broadcast_ops = True
)DOC")
    // fuse_broadcast_ops: the member is optional-like. The getter reports
    // True both when it holds true and when it is unset (paddle::none)
    // (setter rejected after finalize).
    .def_property(
        "fuse_broadcast_ops",
        [](const BuildStrategy &self) {
          return self.fuse_broadcast_ops_ == true ||
                 self.fuse_broadcast_ops_ == paddle::none;
        },
        [](BuildStrategy &self, bool b) {
          PADDLE_ENFORCE_NE(self.IsFinalized(),
                            true,
                            platform::errors::PreconditionNotMet(
                                "BuildStrategy has been finalized, "
                                "cannot be configured again."));
          self.fuse_broadcast_ops_ = b;
        },
        R"DOC((bool, optional): fuse_broadcast_op indicates whether
to fuse the broadcast ops. Note that, in Reduce mode,
fusing broadcast ops may make the program faster. Because
fusing broadcast OP equals delaying the execution of all
broadcast Ops, in this case, all nccl streams are used only
for NCCLReduce operations for a period of time. Default False.
Examples:
.. code-block:: python
import paddle
import paddle.static as static
paddle.enable_static()
build_strategy = static.BuildStrategy()
build_strategy.fuse_broadcast_ops = True
)DOC")
    // fuse_all_optimizer_ops: same "true or unset reads as True" getter as
    // fuse_broadcast_ops (setter rejected after finalize).
    .def_property(
        "fuse_all_optimizer_ops",
        [](const BuildStrategy &self) {
          return self.fuse_all_optimizer_ops_ == true ||
                 self.fuse_all_optimizer_ops_ == paddle::none;
        },
        [](BuildStrategy &self, bool b) {
          PADDLE_ENFORCE_NE(self.IsFinalized(),
                            true,
                            platform::errors::PreconditionNotMet(
                                "BuildStrategy has been finalized, "
                                "cannot be configured again."));
          self.fuse_all_optimizer_ops_ = b;
        })
    // sync_batch_norm <-> BuildStrategy::sync_batch_norm_
    // (rejected after finalize).
    .def_property(
        "sync_batch_norm",
        [](const BuildStrategy &self) { return self.sync_batch_norm_; },
        [](BuildStrategy &self, bool b) {
          PADDLE_ENFORCE_NE(self.IsFinalized(),
                            true,
                            platform::errors::PreconditionNotMet(
                                "BuildStrategy has been finalized, cannot be "
                                "configured again."));
          self.sync_batch_norm_ = b;
        },
        R"DOC((bool, optional): sync_batch_norm indicates whether to use
synchronous batch normalization which synchronizes the mean
and variance through multi-devices in training phase.
Current implementation doesn't support FP16 training and CPU.
And only synchronous on one machine, not all machines.
Default is False.
Examples:
.. code-block:: python
import paddle
import paddle.static as static
paddle.enable_static()
build_strategy = static.BuildStrategy()
build_strategy.sync_batch_norm = True
)DOC")
    // memory_optimize is tri-state: None (unset), True or False. The getter
    // returns Python None when the member holds no value; the setter accepts
    // only None or a real Python bool and throws InvalidArgument otherwise.
    .def_property(
        "memory_optimize",
        [](const BuildStrategy &self) -> py::object {
          if (self.memory_optimize_) {
            return py::cast(self.memory_optimize_.get());
          } else {
            return py::none();
          }
        },
        [](BuildStrategy &self, const py::handle &value) {
          auto *py_obj = value.ptr();
          if (py_obj == nullptr || py_obj == Py_None) {
            self.memory_optimize_ = paddle::none;
          } else if (PyBool_Check(py_obj)) {
            self.memory_optimize_ = (py_obj == Py_True);
          } else {
            PADDLE_THROW(platform::errors::InvalidArgument(
                "BuildStrategy.memory_optimize must be set to None, False "
                "or True"));
          }
        },
        R"DOC((bool, optional): memory optimize aims to save total memory
consumption, set to True to enable it.
Default None. None means framework would choose to use or not use
this strategy automatically. Currently, None means that it is
enabled when GC is disabled, and disabled when GC is enabled.
True means enabling and False means disabling. Default is None.
Examples:
.. code-block:: python
import paddle
import paddle.static as static
paddle.enable_static()
build_strategy = static.BuildStrategy()
build_strategy.memory_optimize = True
)DOC")
    // is_distribution <-> BuildStrategy::is_distribution_. On WIN32 the
    // setter never stores: writing True throws Unavailable and writing False
    // is a no-op.
    .def_property(
        "is_distribution",
        [](const BuildStrategy &self) { return self.is_distribution_; },
        [](BuildStrategy &self, bool b) {
#ifdef WIN32
          if (b) {
            PADDLE_THROW(platform::errors::Unavailable(
                "Distribution mode is not supported on Windows platform."));
          }
#else
          self.is_distribution_ = b;
#endif
        })
    // async_mode <-> BuildStrategy::async_mode_
    .def_property(
        "async_mode",
        [](const BuildStrategy &self) { return self.async_mode_; },
        [](BuildStrategy &self, bool b) { self.async_mode_ = b; })
    // enable_inplace <-> BuildStrategy::enable_inplace_
    .def_property(
        "enable_inplace",
        [](const BuildStrategy &self) { return self.enable_inplace_; },
        [](BuildStrategy &self, bool b) { self.enable_inplace_ = b; })
    // enable_addto <-> BuildStrategy::enable_addto_
    .def_property(
        "enable_addto",
        [](const BuildStrategy &self) { return self.enable_addto_; },
        [](BuildStrategy &self, bool b) { self.enable_addto_ = b; })
    // fuse_all_reduce_ops: getter reads True when the member is true or
    // unset (paddle::none); the setter has no finalized check.
    .def_property(
        "fuse_all_reduce_ops",
        [](const BuildStrategy &self) {
          return self.fuse_all_reduce_ops_ == true ||
                 self.fuse_all_reduce_ops_ == paddle::none;
        },
        [](BuildStrategy &self, bool b) { self.fuse_all_reduce_ops_ = b; })
    // enable_backward_optimizer_op_deps <->
    // BuildStrategy::enable_backward_optimizer_op_deps_
    .def_property(
        "enable_backward_optimizer_op_deps",
        [](const BuildStrategy &self) {
          return self.enable_backward_optimizer_op_deps_;
        },
        [](BuildStrategy &self, bool b) {
          self.enable_backward_optimizer_op_deps_ = b;
        })
    // cache_runtime_context <-> BuildStrategy::cache_runtime_context_
    .def_property(
        "cache_runtime_context",
        [](const BuildStrategy &self) { return self.cache_runtime_context_; },
        [](BuildStrategy &self, bool b) { self.cache_runtime_context_ = b; })
    // mkldnn_enabled_op_types <-> BuildStrategy::mkldnn_enabled_op_types_
    .def_property(
        "mkldnn_enabled_op_types",
        [](const BuildStrategy &self) {
          return self.mkldnn_enabled_op_types_;
        },
        [](BuildStrategy &self,
           const std::unordered_set<std::string> &mkldnn_enabled_op_types) {
          self.mkldnn_enabled_op_types_ = mkldnn_enabled_op_types;
        })
    // fix_op_run_order <-> BuildStrategy::fix_op_run_order_
    .def_property(
        "fix_op_run_order",
        [](const BuildStrategy &self) { return self.fix_op_run_order_; },
        [](BuildStrategy &self, bool fix_op_run_order) {
          self.fix_op_run_order_ = fix_op_run_order;
        })
    // allow_cuda_graph_capture <-> BuildStrategy::allow_cuda_graph_capture_
    .def_property(
        "allow_cuda_graph_capture",
        [](const BuildStrategy &self) {
          return self.allow_cuda_graph_capture_;
        },
        [](BuildStrategy &self, bool allow_cuda_graph_capture) {
          self.allow_cuda_graph_capture_ = allow_cuda_graph_capture;
        })
    // _copy returns a copy of the strategy on which ClearFinalized() has
    // been called, leaving the source object untouched.
    .def("_copy",
         [](const BuildStrategy &self) {
           auto new_bs = self;
           new_bs.ClearFinalized();
           return new_bs;
         })
    // Forwards to CreatePassesFromStrategy(true) and hands the resulting
    // PassBuilder to Python as a shared_ptr.
    .def(
        "_finalize_strategy_and_create_passes",
        [](BuildStrategy &self) -> std::shared_ptr<ir::PassBuilder> {
          return self.CreatePassesFromStrategy(true);
        },
        R"DOC(Allow user to customized passes. Normally model-specific
optimization passes should be defined in this way. BuildStrategy
cannot be updated after being finalized.)DOC");
// Module-level helper: records `build_strategy` for `program_id` in the
// process-wide ExecutorInfoCache singleton.
m.def("_set_cached_executor_build_strategy",
      [](int64_t program_id, const BuildStrategy &build_strategy) {
        framework::ExecutorInfoCache::Instance().SetBuildStrategy(
            program_id, build_strategy);
      });
// Constructor of ParallelExecutor; the template arguments spell out the C++
// constructor signature that Python calls into.
pe.def(py::init<const std::vector<platform::Place> &,
                const std::vector<std::string> &,
                const std::string &,
                Scope *,
                std::vector<Scope *> &,
                const ExecutionStrategy &,
                const BuildStrategy &,
                ir::Graph *>());

// NOTE: although a vec<Scope*>* is handed to Python under the reference
// policy, local scopes still cannot be taken out of that vector, because its
// elements would be freed by the Python GC. Scope* values can only be
// returned one at a time, each marked as a reference.
pe.def(
    "local_scopes",
    [](ParallelExecutor &executor) -> std::vector<Scope *> * {
      return &executor.GetLocalScopes();
    },
    py::return_value_policy::reference);

// Thin forwards to the ParallelExecutor member functions of the same purpose.
pe.def("drop_local_exe_scopes", &ParallelExecutor::DropLocalExeScopes);
pe.def("_need_create_local_exe_scopes",
       &ParallelExecutor::NeedCreateLocalExeScope);
pe.def("feed_tensors_into_local_scopes",
       &ParallelExecutor::FeedTensorsIntoLocalScopes);
pe.def("feed_and_split_tensor_into_local_scopes",
       &ParallelExecutor::FeedAndSplitTensorIntoLocalScopes);

// run(fetch_tensors, return_merged): executes the program and returns either
// a merged FetchList or an unmerged FetchUnmergedList. The GIL is released
// only for the duration of the C++ call and is held again before the result
// is converted to a Python object.
pe.def("run",
       [](ParallelExecutor &executor,
          const std::vector<std::string> &fetch_tensors,
          bool return_merged) -> py::object {
         if (!return_merged) {
           paddle::framework::FetchUnmergedList unmerged;
           {
             pybind11::gil_scoped_release no_gil;
             unmerged = executor.Run(fetch_tensors);
           }
           return py::cast(std::move(unmerged));
         }
         paddle::framework::FetchList merged;
         {
           pybind11::gil_scoped_release no_gil;
           merged = executor.RunAndMerge(fetch_tensors);
         }
         return py::cast(std::move(merged));
       });

pe.def("device_count", &ParallelExecutor::DeviceCount);
// Attribute payload type accepted by one of the Pass.set overloads:
// name -> (bool, LoDTensor).
using VarQuantScale =
    std::unordered_map<std::string, std::pair<bool, LoDTensor>>;

// Python class "Pass" over ir::Pass, held by std::shared_ptr.
//
// "set" is overloaded on the attribute type; pybind11 tries the overloads in
// registration order, so the order below must be kept. Each overload
// heap-allocates the attribute and passes the raw pointer to ir::Pass::Set.
// NOTE(review): presumably ir::Pass::Set takes ownership of that pointer --
// confirm against ir::Pass before touching the allocations.
// Containers received by value are moved (not copied again) into the
// heap-allocated attribute.
py::class_<ir::Pass, std::shared_ptr<ir::Pass>> pass(m, "Pass");
pass.def(py::init())
    .def("has", &ir::Pass::Has)
    // Stores a non-owning pointer to a ProgramDesc that Python keeps alive.
    .def("set_not_owned",
         [](ir::Pass &self, const std::string &attr_name, ProgramDesc &attr) {
           self.SetNotOwned<ProgramDesc>(attr_name, &attr);
         })
    .def(
        "set",
        [](ir::Pass &self, const std::string &name, const std::string &attr) {
          self.Set<std::string>(name, new std::string(attr));
        })
    .def("set",
         [](ir::Pass &self, const std::string &name, bool val) {
           self.Set<bool>(name, new bool(val));
         })
    // Integer attributes are registered under the type `const int`.
    .def("set",
         [](ir::Pass &self, const std::string &name, int val) {
           self.Set<const int>(name, new int(val));
         })
    .def("set",
         [](ir::Pass &self,
            const std::string &name,
            std::vector<std::string> values) {
           self.Set(name, new std::vector<std::string>(std::move(values)));
         })
    .def("set",
         [](ir::Pass &self,
            const std::string &name,
            std::unordered_set<std::string> values) {
           self.Set(name,
                    new std::unordered_set<std::string>(std::move(values)));
         })
    .def("set",
         [](ir::Pass &self,
            const std::string &name,
            std::unordered_set<int> values) {
           self.Set(name, new std::unordered_set<int>(std::move(values)));
         })
    .def("set",
         [](ir::Pass &self, const std::string &name, VarQuantScale scales) {
           self.Set(name, new VarQuantScale(std::move(scales)));
         })
    .def("type", &ir::Pass::Type)
    // Applies the pass to the graph owned by the given shared_ptr; the value
    // returned by ir::Pass::Apply is discarded.
    .def("apply", [](ir::Pass &self, std::shared_ptr<ir::Graph> graph) {
      self.Apply(graph.get());
    });
// Python class "PassBuilder" over ir::PassBuilder, held by std::shared_ptr.
// Each method forwards directly to the ir::PassBuilder member of the same
// purpose.
py::class_<ir::PassBuilder, std::shared_ptr<ir::PassBuilder>> pb(
    m, "PassBuilder");
pb.def(py::init());
pb.def("append_pass",
       [](ir::PassBuilder &builder,
          const std::string &pass_type) -> std::shared_ptr<ir::Pass> {
         return builder.AppendPass(pass_type);
       });
pb.def("all_passes",
       [](ir::PassBuilder &builder) { return builder.AllPasses(); });
pb.def("insert_pass",
       [](ir::PassBuilder &builder, size_t idx, const std::string &pass_type) {
         return builder.InsertPass(idx, pass_type);
       });
pb.def("remove_pass",
       [](ir::PassBuilder &builder, size_t idx) { builder.RemovePass(idx); });
}
} // namespace pybind
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "pybind11/pybind11.h"
namespace paddle {
namespace pybind {
// Registers the ParallelExecutor-related Python bindings on module `m`: the
// ParallelExecutor class with its nested ExecutionStrategy and BuildStrategy
// classes, the DeviceType enum, the Pass and PassBuilder classes, and the
// module function `_set_cached_executor_build_strategy`.
// `m` is taken by non-const reference because the bindings are added to it.
void BindParallelExecutor(pybind11::module& m);  // NOLINT
}  // namespace pybind
}  // namespace paddle
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Copyright (c) 2022 NVIDIA Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <Python.h>
#include <algorithm>
#include <cctype>
#include <cstdlib>
#include <iterator>
#include <map>
#include <memory>
#include <mutex> // NOLINT // for call_once
#include <string>
#include <tuple>
#include <type_traits>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/framework/custom_operator.h"
#include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/framework/data_type_transform.h"
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/executor_cache.h"
#include "paddle/fluid/framework/executor_gc_helper.h"
#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/garbage_collector.h"
#include "paddle/fluid/framework/io/fs.h"
#include "paddle/fluid/framework/ir/coalesce_grad_tensor_pass.h"
#include "paddle/fluid/framework/ir/cost_model.h"
#include "paddle/fluid/framework/ir/generate_pass.h"
#include "paddle/fluid/framework/ir/pass_builder.h"
#include "paddle/fluid/framework/lod_rank_table.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/new_executor/executor_statistics.h"
#include "paddle/fluid/framework/new_executor/standalone_executor.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/framework/parallel_executor.h"
#include "paddle/fluid/framework/phi_utils.h"
#include "paddle/fluid/framework/prune.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/save_load_util.h"
#include "paddle/fluid/framework/scope_pool.h"
#include "paddle/fluid/framework/selected_rows_utils.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/framework/trainer.h"
#include "paddle/fluid/framework/type_defs.h"
#include "paddle/fluid/framework/version.h"
#include "paddle/fluid/imperative/amp_auto_cast.h"
#include "paddle/fluid/imperative/layer.h"
#include "paddle/fluid/memory/allocation/allocator_strategy.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/memory/allocation/cuda_ipc_allocator.h"
#endif
#include "paddle/fluid/memory/allocation/mmap_allocator.h"
#include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/common_infer_shape_functions.h"
#include "paddle/fluid/operators/py_func_op.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/init.h"
#include "paddle/fluid/platform/monitor.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/profiler/event_python.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/fluid/platform/profiler/profiler.h"
#include "paddle/fluid/pybind/cuda_streams_py.h"
#include "paddle/fluid/pybind/distributed_py.h"
#include "paddle/fluid/pybind/eager.h"
#include "paddle/fluid/pybind/imperative.h"
#include "paddle/fluid/pybind/io.h"
#include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/lod_utils.h"
#include "paddle/utils/none.h"
#ifdef PADDLE_WITH_ASCEND
#include "paddle/fluid/pybind/ascend_wrapper_py.h"
#endif
#include "paddle/fluid/pybind/bind_cost_model.h"
#include "paddle/fluid/pybind/bind_fleet_executor.h"
#include "paddle/fluid/pybind/box_helper_py.h"
#include "paddle/fluid/pybind/communication.h"
#include "paddle/fluid/pybind/compatible.h"
#include "paddle/fluid/pybind/const_value.h"
#include "paddle/fluid/pybind/data_set_py.h"
#include "paddle/fluid/pybind/exception.h"
#include "paddle/fluid/pybind/fleet_wrapper_py.h"
#include "paddle/fluid/pybind/generator_py.h"
#include "paddle/fluid/pybind/global_value_getter_setter.h"
#include "paddle/fluid/pybind/gloo_context_py.h"
#include "paddle/fluid/pybind/gloo_wrapper_py.h"
#include "paddle/fluid/pybind/heter_wrapper_py.h"
#include "paddle/fluid/pybind/inference_api.h"
#include "paddle/fluid/pybind/ir.h"
#include "paddle/fluid/pybind/metrics_py.h"
#include "paddle/fluid/pybind/ps_gpu_wrapper_py.h"
#include "paddle/fluid/pybind/pybind_boost_headers.h"
#include "paddle/phi/backends/device_manager.h"
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/pybind/nccl_wrapper_py.h"
#endif
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/pybind/protobuf.h"
#include "paddle/fluid/pybind/pybind.h" // NOLINT
#include "paddle/fluid/pybind/reader_py.h"
#include "paddle/fluid/pybind/tensor_py.h"
#include "paddle/fluid/string/to_string.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
#endif
#ifndef PADDLE_WITH_HIP
#include "paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h"
#endif
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#endif
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/npu/npu_info.h"
#endif
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/device/xpu/xpu_info.h"
#include "paddle/fluid/platform/device/xpu/xpu_op_list.h"
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
#include "paddle/phi/capi/capi.h"
#endif
#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
#ifdef PADDLE_WITH_IPU
#include "paddle/fluid/platform/device/ipu/ipu_backend.h"
#include "paddle/fluid/platform/device/ipu/ipu_info.h"
#endif
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
#ifdef PADDLE_WITH_CRYPTO
#include "paddle/fluid/pybind/crypto.h"
#endif
#if defined PADDLE_WITH_PSCORE
#include "paddle/fluid/pybind/fleet_py.h"
#endif
#ifdef PADDLE_WITH_CINN
#include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h"
#endif
#include "paddle/fluid/eager/api/utils/global_utils.h"
#include "paddle/fluid/imperative/layout_autotune.h"
#include "paddle/fluid/pybind/eager_utils.h"
#include "paddle/fluid/pybind/place.h"
#include "paddle/phi/api/ext/op_meta_info.h"
#include "paddle/phi/kernels/autotune/cache.h"
#include "paddle/phi/kernels/autotune/switch_autotune.h"
#include "pybind11/stl.h"
// Declares the `use_mkldnn` boolean flag (presumably a gflags flag defined in
// another translation unit).
// NOTE(review): nothing in the code of this file visible below reads this
// flag - confirm whether the declaration is still needed here.
DECLARE_bool(use_mkldnn);
// disable auto conversion to list in Python
// PYBIND11_MAKE_OPAQUE switches off pybind11's automatic copy-conversion for
// the listed types, so they are only exposed through explicitly bound classes.
// NOTE(review): presumably these declarations must match the ones in the other
// binding sources that handle the same types - verify.
PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray);
PYBIND11_MAKE_OPAQUE(paddle::framework::FetchUnmergedList);
PYBIND11_MAKE_OPAQUE(paddle::framework::FetchList);
PYBIND11_MAKE_OPAQUE(paddle::framework::FetchType);
namespace paddle {
namespace pybind {
// Python type objects of the bound Place classes. Every pointer starts out
// null and is filled in by BindPlace() when the matching class is registered.

// Generic place.
PyTypeObject *g_place_pytype{nullptr};
// Host-side places.
PyTypeObject *g_cpuplace_pytype{nullptr};
PyTypeObject *g_cudapinnedplace_pytype{nullptr};
// Device places.
PyTypeObject *g_cudaplace_pytype{nullptr};
PyTypeObject *g_xpuplace_pytype{nullptr};
PyTypeObject *g_npuplace_pytype{nullptr};
PyTypeObject *g_mluplace_pytype{nullptr};
PyTypeObject *g_customplace_pytype{nullptr};
template <typename PlaceType>
static inline int PlaceIndex(const PlaceType &p) { // NOLINT
return static_cast<int>(paddle::platform::Place(p).GetType());
}
template <typename PlaceType1, typename PlaceType2>
static inline bool IsSamePlace(const PlaceType1 &p1, const PlaceType2 &p2) {
return paddle::platform::Place(p1) == paddle::platform::Place(p2);
}
// Registers the Python-visible Place classes on module `m`.
//
// Bound classes: CustomPlace, CUDAPlace, XPUPlace, CPUPlace, CUDAPinnedPlace,
// NPUPlace, IPUPlace, MLUPlace and the generic Place. For every class bound
// through a named py::class_ variable, the matching g_*_pytype global is set
// to the Python type object created for it.
//
// The constructors of the device places validate the requested device id
// against the devices detected at runtime. When the id is invalid, or when the
// corresponding backend was not compiled in, they log an error and terminate
// the process with std::exit(-1). CUDAPinnedPlace is the one exception: it
// reports a missing CUDA/HIP build through PADDLE_THROW instead.
void BindPlace(pybind11::module &m) {  // NOLINT
  using namespace paddle::framework;   // NOLINT

  // ---------------------------------------------------------------- CustomPlace
  py::class_<platform::CustomPlace> customplace(m,
                                                "CustomPlace",
                                                R"DOC(
CustomPlace is a descriptor of a device.
It represents a custom device on which a tensor will be allocated and a model will run.
Examples:
.. code-block:: python
import paddle
fake_cpu_place = paddle.CustomPlace("FakeCPU", 0)
)DOC");
  g_customplace_pytype = reinterpret_cast<PyTypeObject *>(customplace.ptr());
  customplace
      .def("__init__",
           [](platform::CustomPlace &self,
              const std::string &device_type,
              int dev_id) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
             if (UNLIKELY(dev_id < 0)) {
               LOG(ERROR) << string::Sprintf(
                   "Invalid CustomPlace(%s, %d), device id must be 0 "
                   "or "
                   "positive integer",
                   device_type,
                   dev_id);
               std::exit(-1);
             }
             // The device type must be known to the device manager and be
             // flagged as a custom device.
             if (LIKELY(phi::DeviceManager::HasDeviceType(device_type) &&
                        phi::DeviceManager::IsCustom(device_type))) {
               int dev_count = static_cast<int>(
                   phi::DeviceManager::GetDeviceCount(device_type));
               if (UNLIKELY(dev_id >= dev_count)) {
                 if (dev_count == 0) {
                   LOG(ERROR) << "Cannot use " << device_type
                              << " because there is no " << device_type
                              << " detected on your "
                                 "machine.";
                   std::exit(-1);
                 } else {
                   LOG(ERROR) << string::Sprintf(
                       "Invalid CustomPlace(%s, %d), dev_id must "
                       "inside "
                       "[0, %d), because %s "
                       "number on your machine is %d",
                       device_type,
                       dev_id,
                       dev_count,
                       device_type,
                       dev_count);
                   std::exit(-1);
                 }
               }
               // Old-style pybind11 constructor: build the object in the
               // storage that pybind11 handed in as `self`.
               new (&self) platform::CustomPlace(device_type, dev_id);
             } else {
               LOG(ERROR) << string::Sprintf(
                   "Invalid CustomPlace(%s, %d), the device type is "
                   "not registered "
                   "as a custom device.",
                   device_type,
                   dev_id);
               std::exit(-1);
             }
#else
             // Adjacent literals are concatenated verbatim, so each fragment
             // that continues a sentence must end with a space.
             LOG(ERROR) << string::Sprintf(
                 "Cannot use CustomDevice because you have installed CPU/GPU "
                 "version PaddlePaddle.\n"
                 "If you want to use CustomDevice, please try to install "
                 "CustomDevice version "
                 "PaddlePaddle by: pip install paddlepaddle\n"
                 "If you only have CPU, please change "
                 "CustomPlace(%s, %d) to be CPUPlace().\n",
                 device_type, dev_id);
             std::exit(-1);
#endif
           })
      .def("_type", &PlaceIndex<platform::CustomPlace>)
      .def("get_device_id",
           [](const platform::CustomPlace &self) { return self.GetDeviceId(); })
      .def("get_device_type",
           [](const platform::CustomPlace &self) {
             return self.GetDeviceType();
           })
      .def("__repr__", string::to_string<const platform::CustomPlace &>)
      .def("__str__", string::to_string<const platform::CustomPlace &>);

  // ------------------------------------------------------------------ CUDAPlace
  py::class_<platform::CUDAPlace> cudaplace(m, "CUDAPlace", R"DOC(
CUDAPlace is a descriptor of a device.
It represents a GPU device allocated or to be allocated with Tensor or LoDTensor.
Each CUDAPlace has a dev_id to indicate the graphics card ID represented by the current CUDAPlace,
starting from 0.
The memory of CUDAPlace with different dev_id is not accessible.
Numbering here refers to the logical ID of the visible graphics card, not the actual ID of the graphics card.
You can set visible GPU devices by setting the `CUDA_VISIBLE_DEVICES` environment variable.
When the program starts, visible GPU devices will be numbered from 0.
If `CUDA_VISIBLE_DEVICES` is not set, all devices are visible by default,
and the logical ID is the same as the actual ID.
Parameters:
id (int): GPU device ID.
Examples:
.. code-block:: python
import paddle
place = paddle.CUDAPlace(0)
)DOC");
  g_cudaplace_pytype = reinterpret_cast<PyTypeObject *>(cudaplace.ptr());
  cudaplace
      .def("__init__",
           [](platform::CUDAPlace &self, int dev_id) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
             if (UNLIKELY(dev_id < 0)) {
               LOG(ERROR) << string::Sprintf(
                   "Invalid CUDAPlace(%d), device id must be 0 or "
                   "positive integer",
                   dev_id);
               std::exit(-1);
             }
             // Query the device count once and reuse it for the range check
             // and for the diagnostics.
             const int gpu_count = platform::GetGPUDeviceCount();
             if (UNLIKELY(dev_id >= gpu_count)) {
               if (gpu_count == 0) {
                 LOG(ERROR) << "Cannot use GPU because there is no GPU "
                               "detected on your "
                               "machine.";
                 std::exit(-1);
               } else {
                 LOG(ERROR) << string::Sprintf(
                     "Invalid CUDAPlace(%d), must inside [0, %d), because GPU "
                     "number on your machine is %d",
                     dev_id,
                     gpu_count,
                     gpu_count);
                 std::exit(-1);
               }
             }
             new (&self) platform::CUDAPlace(dev_id);
#else
             LOG(ERROR) << string::Sprintf(
                 "Cannot use GPU because you have installed CPU version "
                 "PaddlePaddle.\n"
                 "If you want to use GPU, please try to install GPU version "
                 "PaddlePaddle by: pip install paddlepaddle-gpu\n"
                 "If you only have CPU, please change CUDAPlace(%d) to be "
                 "CPUPlace().\n",
                 dev_id);
             std::exit(-1);
#endif
           })
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
      .def("get_device_id",
           [](const platform::CUDAPlace &self) { return self.GetDeviceId(); })
      .def("_type", &PlaceIndex<platform::CUDAPlace>)
      .def("_equals", &IsSamePlace<platform::CUDAPlace, platform::Place>)
      .def("_equals", &IsSamePlace<platform::CUDAPlace, platform::CUDAPlace>)
      .def("_equals", &IsSamePlace<platform::CUDAPlace, platform::CPUPlace>)
      .def("_equals", &IsSamePlace<platform::CUDAPlace, platform::XPUPlace>)
      .def("_equals", &IsSamePlace<platform::CUDAPlace, platform::NPUPlace>)
      .def("_equals", &IsSamePlace<platform::CUDAPlace, platform::MLUPlace>)
      .def("_equals",
           &IsSamePlace<platform::CUDAPlace, platform::CUDAPinnedPlace>)
      .def("_get_device_id",
           [](platform::CUDAPlace &self) -> int { return self.GetDeviceId(); })
#endif
      .def("__repr__", string::to_string<const platform::CUDAPlace &>)
      .def("__str__", string::to_string<const platform::CUDAPlace &>);

  // ------------------------------------------------------------------- XPUPlace
  py::class_<platform::XPUPlace> xpuplace(m, "XPUPlace", R"DOC(
**Note**:
Examples:
.. code-block:: python
import paddle.fluid as fluid
xpu_place = fluid.XPUPlace(0)
)DOC");
  g_xpuplace_pytype = reinterpret_cast<PyTypeObject *>(xpuplace.ptr());
  xpuplace
      .def("__init__",
           [](platform::XPUPlace &self, int dev_id) {
#ifdef PADDLE_WITH_XPU
             if (UNLIKELY(dev_id < 0)) {
               LOG(ERROR) << string::Sprintf(
                   "Invalid XPUPlace(%d), device id must be 0 or "
                   "positive integer",
                   dev_id);
               std::exit(-1);
             }
             // Query the device count once and reuse it below.
             const int xpu_count = platform::GetXPUDeviceCount();
             if (UNLIKELY(dev_id >= xpu_count)) {
               if (xpu_count == 0) {
                 LOG(ERROR) << "Cannot use XPU because there is no XPU "
                               "detected on your "
                               "machine.";
                 std::exit(-1);
               } else {
                 LOG(ERROR) << string::Sprintf(
                     "Invalid XPUPlace(%d), must inside [0, %d), because XPU "
                     "number on your machine is %d",
                     dev_id,
                     xpu_count,
                     xpu_count);
                 std::exit(-1);
               }
             }
             new (&self) platform::XPUPlace(dev_id);
#else
             LOG(ERROR) << string::Sprintf(
                 "Cannot use XPU because you have installed CPU/GPU version "
                 "PaddlePaddle.\n"
                 "If you want to use XPU, please try to install XPU version "
                 "PaddlePaddle by: pip install paddlepaddle-xpu\n"
                 "If you only have CPU, please change XPUPlace(%d) to be "
                 "CPUPlace().\n",
                 dev_id);
             std::exit(-1);
#endif
           })
#ifdef PADDLE_WITH_XPU
      .def("_type", &PlaceIndex<platform::XPUPlace>)
      .def("_equals", &IsSamePlace<platform::XPUPlace, platform::Place>)
      .def("_equals", &IsSamePlace<platform::XPUPlace, platform::CUDAPlace>)
      .def("_equals", &IsSamePlace<platform::XPUPlace, platform::CPUPlace>)
      .def("_equals", &IsSamePlace<platform::XPUPlace, platform::XPUPlace>)
      .def("_equals",
           &IsSamePlace<platform::XPUPlace, platform::CUDAPinnedPlace>)
      .def("get_device_id",
           [](const platform::XPUPlace &self) { return self.GetDeviceId(); })
#endif
      .def("__repr__", string::to_string<const platform::XPUPlace &>)
      .def("__str__", string::to_string<const platform::XPUPlace &>);
#ifdef PADDLE_WITH_XPU
  // XPU-only module-level helpers.
  py::enum_<phi::backends::xpu::XPUVersion>(m, "XPUVersion", py::arithmetic())
      .value("XPU1", phi::backends::xpu::XPUVersion::XPU1)
      .value("XPU2", phi::backends::xpu::XPUVersion::XPU2)
      .export_values();
  m.def("get_xpu_device_count", platform::GetXPUDeviceCount);
  m.def("get_xpu_device_version",
        [](int device_id) { return platform::get_xpu_version(device_id); });
#ifdef PADDLE_WITH_XPU_KP
  m.def("get_xpu_device_op_support_types",
        [](const std::string &op_name, phi::backends::xpu::XPUVersion version) {
          return platform::get_xpu_kp_op_support_type(op_name, version);
        });
#else
  m.def("get_xpu_device_op_support_types",
        [](const std::string &op_name, phi::backends::xpu::XPUVersion version) {
          return platform::get_xpu_op_support_type(op_name, version);
        });
#endif
  m.def("get_xpu_device_op_list", [](phi::backends::xpu::XPUVersion version) {
    return platform::get_xpu_op_list(version);
  });
  m.def("is_float16_supported", [](const platform::XPUPlace &place) -> bool {
    // Reported as supported when the device version compares greater than
    // XPU1 (i.e. XPU2 among the versions bound above).
    return platform::get_xpu_version(place.device) >
           phi::backends::xpu::XPUVersion::XPU1;
  });
  m.def("is_bfloat16_supported", [](const platform::XPUPlace &place) -> bool {
    // Same rule as is_float16_supported: any version greater than XPU1.
    return platform::get_xpu_version(place.device) >
           phi::backends::xpu::XPUVersion::XPU1;
  });
#endif

  // ------------------------------------------------------------------- CPUPlace
  py::class_<paddle::platform::CPUPlace> cpuplace(m, "CPUPlace", R"DOC(
CPUPlace is a descriptor of a device.
It represents a CPU device on which a tensor will be allocated and a model will run.
Examples:
.. code-block:: python
import paddle
cpu_place = paddle.CPUPlace()
)DOC");
  g_cpuplace_pytype = reinterpret_cast<PyTypeObject *>(cpuplace.ptr());
  cpuplace.def(py::init<>())
      .def("_type", &PlaceIndex<platform::CPUPlace>)
      .def("_equals", &IsSamePlace<platform::CPUPlace, platform::Place>)
      .def("_equals", &IsSamePlace<platform::CPUPlace, platform::XPUPlace>)
      .def("_equals", &IsSamePlace<platform::CPUPlace, platform::NPUPlace>)
      .def("_equals", &IsSamePlace<platform::CPUPlace, platform::CUDAPlace>)
      .def("_equals", &IsSamePlace<platform::CPUPlace, platform::CPUPlace>)
      .def("_equals",
           &IsSamePlace<platform::CPUPlace, platform::CUDAPinnedPlace>)
      .def("__repr__", string::to_string<const platform::CPUPlace &>)
      .def("__str__", string::to_string<const platform::CPUPlace &>);

  // ------------------------------------------------------------ CUDAPinnedPlace
  py::class_<paddle::platform::CUDAPinnedPlace> cudapinnedplace(
      m, "CUDAPinnedPlace", R"DOC(
CUDAPinnedPlace is a descriptor of a device.
It refers to the page locked memory allocated by the CUDA function `cudaHostAlloc()` in the host memory.
The host operating system will not paging and exchanging the memory.
It can be accessed through direct memory access technology to speed up the copy of data between the host and GPU.
For more information on CUDA data transfer and `pinned memory`,
please refer to `official document <https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#pinned-memory>`_ .
Examples:
.. code-block:: python
import paddle
place = paddle.CUDAPinnedPlace()
)DOC");
  g_cudapinnedplace_pytype =
      reinterpret_cast<PyTypeObject *>(cudapinnedplace.ptr());
  cudapinnedplace
      .def("__init__",
           [](platform::CUDAPinnedPlace &self) {
#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
             // Unlike the device places, this one reports the unsupported
             // build through PADDLE_THROW rather than std::exit.
             PADDLE_THROW(platform::errors::PermissionDenied(
                 "Cannot use CUDAPinnedPlace in CPU only version, "
                 "Please recompile or reinstall Paddle with CUDA support."));
#endif
             new (&self) platform::CUDAPinnedPlace();
           })
      .def("_type", &PlaceIndex<platform::CUDAPinnedPlace>)
      .def("_equals", &IsSamePlace<platform::CUDAPinnedPlace, platform::Place>)
      .def("_equals",
           &IsSamePlace<platform::CUDAPinnedPlace, platform::CUDAPlace>)
      .def("_equals",
           &IsSamePlace<platform::CUDAPinnedPlace, platform::XPUPlace>)
      .def("_equals",
           &IsSamePlace<platform::CUDAPinnedPlace, platform::NPUPlace>)
      .def("_equals",
           &IsSamePlace<platform::CUDAPinnedPlace, platform::CPUPlace>)
      .def("_equals",
           &IsSamePlace<platform::CUDAPinnedPlace, platform::CUDAPinnedPlace>)
      .def("__repr__", string::to_string<const platform::CUDAPinnedPlace &>)
      .def("__str__", string::to_string<const platform::CUDAPinnedPlace &>);

  // ------------------------------------------------------------------- NPUPlace
  py::class_<platform::NPUPlace> npuplace(m, "NPUPlace", R"DOC(
NPUPlace is a descriptor of a device.
It represents a NPU device on which a tensor will be allocated and a model will run.
Examples:
.. code-block:: python
import paddle
npu_place = paddle.NPUPlace(0)
)DOC");
  g_npuplace_pytype = reinterpret_cast<PyTypeObject *>(npuplace.ptr());
  npuplace
      .def("__init__",
           [](platform::NPUPlace &self, int dev_id) {
#ifdef PADDLE_WITH_ASCEND_CL
             if (UNLIKELY(dev_id < 0)) {
               LOG(ERROR) << string::Sprintf(
                   "Invalid NPUPlace(%d), device id must be 0 or "
                   "positive integer",
                   dev_id);
               std::exit(-1);
             }
             // Query the device count once and reuse it below.
             const int npu_count = platform::GetNPUDeviceCount();
             if (UNLIKELY(dev_id >= npu_count)) {
               if (npu_count == 0) {
                 LOG(ERROR) << "Cannot use NPU because there is no NPU "
                               "detected on your "
                               "machine.";
                 std::exit(-1);
               } else {
                 LOG(ERROR) << string::Sprintf(
                     "Invalid NPUPlace(%d), must inside [0, %d), because NPU "
                     "number on your machine is %d",
                     dev_id,
                     npu_count,
                     npu_count);
                 std::exit(-1);
               }
             }
             new (&self) platform::NPUPlace(dev_id);
#else
             LOG(ERROR) << string::Sprintf(
                 "Cannot use NPU because you have installed CPU/GPU version "
                 "PaddlePaddle.\n"
                 "If you want to use NPU, please try to install NPU version "
                 "PaddlePaddle by: pip install paddlepaddle-npu\n"
                 "If you only have CPU, please change NPUPlace(%d) to be "
                 "CPUPlace().\n",
                 dev_id);
             std::exit(-1);
#endif
           })
      .def("_type", &PlaceIndex<platform::NPUPlace>)
      .def("_equals", &IsSamePlace<platform::NPUPlace, platform::Place>)
      .def("_equals", &IsSamePlace<platform::NPUPlace, platform::CUDAPlace>)
      .def("_equals", &IsSamePlace<platform::NPUPlace, platform::CPUPlace>)
      .def("_equals", &IsSamePlace<platform::NPUPlace, platform::XPUPlace>)
      .def("_equals", &IsSamePlace<platform::NPUPlace, platform::NPUPlace>)
      .def("_equals",
           &IsSamePlace<platform::NPUPlace, platform::CUDAPinnedPlace>)
      .def("get_device_id",
           [](const platform::NPUPlace &self) { return self.GetDeviceId(); })
      .def("__str__", string::to_string<const platform::NPUPlace &>);

  // ------------------------------------------------------------------- IPUPlace
  // Bound without a named variable: no g_*_pytype global is recorded for it.
  py::class_<platform::IPUPlace>(m, "IPUPlace", R"DOC(
IPUPlace is a descriptor of a device.
It represents a IPU device on which a tensor will be allocated and a model will run.
Examples:
.. code-block:: python
import paddle
# required: ipu
ipu_place = paddle.IPUPlace()
)DOC")
      .def("__init__",
           [](platform::IPUPlace &self) {
#ifdef PADDLE_WITH_IPU
             if (platform::GetIPUDeviceCount() == 0) {
               LOG(ERROR) << "Cannot use IPU because there is no IPU "
                             "detected on your "
                             "machine.";
               std::exit(-1);
             }
             // Use ipu(0) to compile; at run time the number of devices the
             // user configured for sharding and pipelining is used.
             new (&self) platform::IPUPlace(0);
#else
             LOG(ERROR) << string::Sprintf(
                 "Cannot use IPU because you didn't install IPU version "
                 "PaddlePaddle.\n"
                 "If you want to use IPU, please try to install IPU version "
                 "PaddlePaddle by: pip install paddlepaddle*\n"
                 "If you only have CPU, please change IPUPlace to be "
                 "CPUPlace().\n");
             std::exit(-1);
#endif
           })
      .def("_type", &PlaceIndex<platform::IPUPlace>)
      .def("_equals", &IsSamePlace<platform::IPUPlace, platform::Place>)
      .def("_equals", &IsSamePlace<platform::IPUPlace, platform::CUDAPlace>)
      .def("_equals", &IsSamePlace<platform::IPUPlace, platform::CPUPlace>)
      .def("_equals", &IsSamePlace<platform::IPUPlace, platform::XPUPlace>)
      .def("_equals", &IsSamePlace<platform::IPUPlace, platform::NPUPlace>)
      .def("_equals", &IsSamePlace<platform::IPUPlace, platform::IPUPlace>)
      .def("_equals",
           &IsSamePlace<platform::IPUPlace, platform::CUDAPinnedPlace>)
#ifdef PADDLE_WITH_IPU
      .def("get_device_id",
           [](const platform::IPUPlace &self) { return self.GetDeviceId(); })
#endif
      .def("__str__", string::to_string<const platform::IPUPlace &>);

  // ------------------------------------------------------------------- MLUPlace
  py::class_<platform::MLUPlace> mluplace(m, "MLUPlace", R"DOC(
MLUPlace is a descriptor of a device.
It represents a MLU device on which a tensor will be allocated and a model will run.
Examples:
.. code-block:: python
import paddle
# required: mlu
mlu_place = paddle.MLUPlace(0)
)DOC");
  g_mluplace_pytype = reinterpret_cast<PyTypeObject *>(mluplace.ptr());
  mluplace
      .def("__init__",
           [](platform::MLUPlace &self, int dev_id) {
#ifdef PADDLE_WITH_MLU
             if (UNLIKELY(dev_id < 0)) {
               LOG(ERROR) << string::Sprintf(
                   "Invalid MLUPlace(%d), device id must be 0 or "
                   "positive integer",
                   dev_id);
               std::exit(-1);
             }
             // Query the device count once and reuse it below.
             const int mlu_count = platform::GetMLUDeviceCount();
             if (UNLIKELY(dev_id >= mlu_count)) {
               if (mlu_count == 0) {
                 LOG(ERROR) << "Cannot use MLU because there is no MLU "
                               "detected on your "
                               "machine.";
                 std::exit(-1);
               } else {
                 LOG(ERROR) << string::Sprintf(
                     "Invalid MLUPlace(%d), must inside [0, %d), because MLU "
                     "number on your machine is %d",
                     dev_id,
                     mlu_count,
                     mlu_count);
                 std::exit(-1);
               }
             }
             new (&self) platform::MLUPlace(dev_id);
#else
             LOG(ERROR) << string::Sprintf(
                 "Cannot use MLU because you have installed CPU/GPU/... "
                 "version "
                 "PaddlePaddle.\n"
                 "If you want to use MLU, please try to install MLU version "
                 "PaddlePaddle by: pip install paddlepaddle-mlu\n"
                 "If you only have CPU, please change MLUPlace(%d) to be "
                 "CPUPlace().\n",
                 dev_id);
             std::exit(-1);
#endif
           })
      .def("_type", &PlaceIndex<platform::MLUPlace>)
#ifdef PADDLE_WITH_MLU
      .def("_equals", &IsSamePlace<platform::MLUPlace, platform::Place>)
      .def("_equals", &IsSamePlace<platform::MLUPlace, platform::CUDAPlace>)
      .def("_equals", &IsSamePlace<platform::MLUPlace, platform::CPUPlace>)
      .def("_equals", &IsSamePlace<platform::MLUPlace, platform::XPUPlace>)
      .def("_equals", &IsSamePlace<platform::MLUPlace, platform::NPUPlace>)
      .def("_equals", &IsSamePlace<platform::MLUPlace, platform::IPUPlace>)
      .def("_equals", &IsSamePlace<platform::MLUPlace, platform::MLUPlace>)
      .def("_equals",
           &IsSamePlace<platform::MLUPlace, platform::CUDAPinnedPlace>)
      .def("get_device_id",
           [](const platform::MLUPlace &self) { return self.GetDeviceId(); })
#endif
      .def("__str__", string::to_string<const platform::MLUPlace &>);

  // -------------------------------------------------------------- generic Place
  py::class_<platform::Place> platformplace(m, "Place");
  g_place_pytype = reinterpret_cast<PyTypeObject *>(platformplace.ptr());
  platformplace.def(py::init<>())
      .def("_type", &PlaceIndex<platform::Place>)
      .def("_equals", &IsSamePlace<platform::Place, platform::Place>)
      .def("_equals", &IsSamePlace<platform::Place, platform::CUDAPlace>)
      .def("_equals", &IsSamePlace<platform::Place, platform::CPUPlace>)
      .def("_equals", &IsSamePlace<platform::Place, platform::XPUPlace>)
      .def("_equals", &IsSamePlace<platform::Place, platform::NPUPlace>)
      .def("_equals", &IsSamePlace<platform::Place, platform::IPUPlace>)
      .def("_equals", &IsSamePlace<platform::Place, platform::CUDAPinnedPlace>)
      .def("_equals", &IsSamePlace<platform::Place, platform::MLUPlace>)
      .def("_equals", &IsSamePlace<platform::Place, platform::CustomPlace>)
      // Kind predicates.
      .def("is_gpu_place",
           [](platform::Place &self) { return platform::is_gpu_place(self); })
      .def("is_cpu_place",
           [](platform::Place &self) { return platform::is_cpu_place(self); })
      .def("is_xpu_place",
           [](platform::Place &self) { return platform::is_xpu_place(self); })
      .def("is_npu_place",
           [](platform::Place &self) { return platform::is_npu_place(self); })
      .def("is_ipu_place",
           [](platform::Place &self) { return platform::is_ipu_place(self); })
      .def("is_cuda_pinned_place",
           [](platform::Place &self) {
             return platform::is_cuda_pinned_place(self);
           })
      .def("is_mlu_place",
           [](platform::Place &self) { return platform::is_mlu_place(self); })
      .def(
          "is_custom_place",
          [](platform::Place &self) { return platform::is_custom_place(self); })
      // Every *_device_id accessor returns the same `device` member; none of
      // them checks that the place is of the kind named by the accessor.
      .def("gpu_device_id", [](platform::Place &self) { return self.device; })
      .def("xpu_device_id", [](platform::Place &self) { return self.device; })
      .def("npu_device_id", [](platform::Place &self) { return self.device; })
      .def("ipu_device_id", [](platform::Place &self) { return self.device; })
      .def("mlu_device_id", [](platform::Place &self) { return self.device; })
      .def("custom_device_id",
           [](platform::Place &self) { return self.device; })
      // set_place overloads: assign any concrete place to the generic Place.
      .def("set_place",
           [](platform::Place &self, const platform::Place &other) {
             self = other;
           })
      .def("set_place",
           [](platform::Place &self, const platform::CPUPlace &cpu_place) {
             self = cpu_place;
           })
      .def("set_place",
           [](platform::Place &self, const platform::XPUPlace &xpu_place) {
             self = xpu_place;
           })
      .def("set_place",
           [](platform::Place &self, const platform::CUDAPlace &gpu_place) {
             self = gpu_place;
           })
      .def("set_place",
           [](platform::Place &self,
              const platform::CUDAPinnedPlace &cuda_pinned_place) {
             self = cuda_pinned_place;
           })
      .def("set_place",
           [](platform::Place &self, const platform::NPUPlace &npu_place) {
             self = npu_place;
           })
      .def("set_place",
           [](platform::Place &self, const platform::IPUPlace &ipu_place) {
             self = ipu_place;
           })
      .def("set_place",
           [](platform::Place &self, const platform::MLUPlace &mlu_place) {
             self = mlu_place;
           })
      .def("set_place",
           [](platform::Place &self, const platform::CustomPlace &plug_place) {
             self = plug_place;
           })
      .def("__repr__", string::to_string<const platform::Place &>)
      .def("__str__", string::to_string<const platform::Place &>);
}
} // namespace pybind
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "pybind11/pybind11.h"
namespace paddle {
namespace pybind {
// Registers the Place-related Python classes on the pybind11 module `m`.
// The module is taken by non-const reference (hence the NOLINT) so that the
// bindings can be added to it.
void BindPlace(pybind11::module& m); // NOLINT
} // namespace pybind
} // namespace paddle
......@@ -122,9 +122,12 @@ limitations under the License. */
#include "paddle/fluid/pybind/nccl_wrapper_py.h"
#endif
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/pybind/parallel_executor.h"
#include "paddle/fluid/pybind/place.h"
#include "paddle/fluid/pybind/protobuf.h"
#include "paddle/fluid/pybind/pybind.h" // NOLINT
#include "paddle/fluid/pybind/reader_py.h"
#include "paddle/fluid/pybind/tensor.h"
#include "paddle/fluid/pybind/tensor_py.h"
#include "paddle/fluid/string/to_string.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
......@@ -194,16 +197,7 @@ PYBIND11_MAKE_OPAQUE(paddle::framework::FetchType);
namespace paddle {
namespace pybind {
// Python type objects for classes bound by the pybind module; each pointer is
// null until the corresponding class is registered.
// NOTE(review): the place-related pointers in this list are also defined in
// the place binding source - confirm that only one definition of each symbol
// remains in the final build to avoid duplicate-symbol link errors.
PyTypeObject *g_place_pytype = nullptr;
PyTypeObject *g_framework_scope_pytype = nullptr;
PyTypeObject *g_cudaplace_pytype = nullptr;
PyTypeObject *g_cpuplace_pytype = nullptr;
PyTypeObject *g_xpuplace_pytype = nullptr;
PyTypeObject *g_npuplace_pytype = nullptr;
PyTypeObject *g_cudapinnedplace_pytype = nullptr;
PyTypeObject *g_mluplace_pytype = nullptr;
PyTypeObject *g_customplace_pytype = nullptr;
PyTypeObject *g_framework_tensor_pytype = nullptr;
PyTypeObject *g_framework_lodtensorarray_pytype = nullptr;
PyTypeObject *g_custom_op_kernel_ctx_pytype = nullptr;
......@@ -349,16 +343,6 @@ bool IsCompiledWithDIST() {
#endif
}
template <typename PlaceType1, typename PlaceType2>
static inline bool IsSamePlace(const PlaceType1 &p1, const PlaceType2 &p2) {
return paddle::platform::Place(p1) == paddle::platform::Place(p2);
}
template <typename PlaceType>
static inline int PlaceIndex(const PlaceType &p) {
return static_cast<int>(paddle::platform::Place(p).GetType());
}
static PyObject *GetPythonAttribute(PyObject *obj, const char *attr_name) {
// NOTE(zjl): PyObject_GetAttrString would return nullptr when attr_name
// is not inside obj, but it would also set the error flag of Python.
......@@ -541,19 +525,6 @@ static int GetNCCLVersion() {
}
#endif
// Copies `src` into `dst` on `place` via framework::TensorCopy.
//
// A negative `batch_size` copies `src` unchanged. Otherwise only the result of
// `src.Slice(0, batch_size)` is copied.
// NOTE(review): presumably Slice(0, batch_size) selects the first batch_size
// entries along the leading dimension - confirm against Tensor::Slice.
template <typename PlaceType>
static void TensorCopyFrom(framework::Tensor *dst,
                           const framework::Tensor &src,
                           const PlaceType &place,
                           int64_t batch_size) {
  if (batch_size >= 0) {
    auto leading_part = src.Slice(0, batch_size);
    framework::TensorCopy(leading_part, place, dst);
    return;
  }
  framework::TensorCopy(src, place, dst);
}
#ifdef PADDLE_WITH_AVX
PYBIND11_MODULE(core_avx, m) {
#else
......@@ -854,897 +825,6 @@ PYBIND11_MODULE(core_noavx, m) {
self.EmplaceBackAttr(attr);
});
// Expose framework::Tensor to Python as "Tensor" with buffer-protocol
// support enabled.
py::class_<framework::Tensor> framework_tensor(
    m, "Tensor", py::buffer_protocol());
// Remember the Python type object of the freshly registered class.
g_framework_tensor_pytype =
    reinterpret_cast<PyTypeObject *>(framework_tensor.ptr());
framework_tensor
    // Conversion to a numpy array.
    .def("__array__",
         [](framework::Tensor &tensor) { return TensorToPyArray(tensor); })
    // Address of the underlying data, returned as an integer.
    .def("_ptr",
         [](const framework::Tensor &tensor) {
           const auto *raw = tensor.data();
           return reinterpret_cast<uintptr_t>(raw);
         })
    .def("_slice", &framework::Tensor::Slice)
    .def("_numel", &framework::Tensor::numel)
    .def("_is_initialized",
         [](const framework::Tensor &tensor) {
           return tensor.IsInitialized();
         })
    // Dimensions as a plain vector.
    .def("_get_dims",
         [](const framework::Tensor &tensor) {
           const auto &dims = tensor.dims();
           return vectorize(dims);
         })
    // Resize the tensor to the given dimensions.
    .def("_set_dims",
         [](framework::Tensor &tensor, const std::vector<int64_t> &dim) {
           const auto new_dims = phi::make_ddim(dim);
           tensor.Resize(new_dims);
         })
    // Set the data layout from its string name.
    .def("_set_layout",
         [](framework::Tensor &tensor, const std::string &layout) {
           const auto parsed_layout = StringToDataLayout(layout);
           tensor.set_layout(parsed_layout);
         })
// "_alloc_float" / "_alloc_double" / "_alloc_int": one overload per place
// type. Each overload calls mutable_data<T>(place) on the tensor and returns
// nothing to Python.
// Coverage differs per element type: "_alloc_double" exists for CPUPlace
// only, and "_alloc_int" has no NPUPlace overload.
// The overloads are registered in the order written here; keep that order
// when editing, since it is the order in which they are tried.
.def("_alloc_float",
[](framework::Tensor &self, paddle::platform::CustomPlace &place) {
self.mutable_data<float>(place);
})
.def("_alloc_float",
[](framework::Tensor &self, paddle::platform::CUDAPlace &place) {
self.mutable_data<float>(place);
})
.def("_alloc_float",
[](framework::Tensor &self, paddle::platform::XPUPlace &place) {
self.mutable_data<float>(place);
})
.def("_alloc_float",
[](framework::Tensor &self, paddle::platform::CPUPlace &place) {
self.mutable_data<float>(place);
})
.def("_alloc_float",
[](framework::Tensor &self, paddle::platform::NPUPlace &place) {
self.mutable_data<float>(place);
})
.def("_alloc_float",
[](framework::Tensor &self, paddle::platform::MLUPlace &place) {
self.mutable_data<float>(place);
})
.def("_alloc_double",
[](framework::Tensor &self, paddle::platform::CPUPlace &place) {
self.mutable_data<double>(place);
})
.def("_alloc_int",
[](framework::Tensor &self, paddle::platform::CPUPlace &place) {
self.mutable_data<int>(place);
})
.def("_alloc_int",
[](framework::Tensor &self, paddle::platform::CustomPlace &place) {
self.mutable_data<int>(place);
})
.def("_alloc_int",
[](framework::Tensor &self, paddle::platform::XPUPlace &place) {
self.mutable_data<int>(place);
})
.def("_alloc_int",
[](framework::Tensor &self, paddle::platform::CUDAPlace &place) {
self.mutable_data<int>(place);
})
.def("_alloc_int",
[](framework::Tensor &self, paddle::platform::MLUPlace &place) {
self.mutable_data<int>(place);
})
.def("_alloc_int",
[](framework::Tensor &self,
paddle::platform::CUDAPinnedPlace &place) {
self.mutable_data<int>(place);
})
.def("_alloc_float",
[](framework::Tensor &self,
paddle::platform::CUDAPinnedPlace &place) {
self.mutable_data<float>(place);
})
// "_mutable_data": one overload per place type. Each converts the proto
// VarType to a phi data type with framework::TransToPhiDataType, calls
// mutable_data(place, dtype) on the tensor, and returns the resulting
// pointer to Python as an integer (uintptr_t).
// "_clear" (bound to Tensor::clear) sits between the MLUPlace and NPUPlace
// overloads; it is an independent method, not part of the overload set.
.def("_mutable_data",
[](framework::Tensor &self,
paddle::platform::CPUPlace &place,
paddle::framework::proto::VarType::Type type) {
return reinterpret_cast<uintptr_t>(
self.mutable_data(place, framework::TransToPhiDataType(type)));
})
.def("_mutable_data",
[](framework::Tensor &self,
paddle::platform::CustomPlace &place,
paddle::framework::proto::VarType::Type type) {
return reinterpret_cast<uintptr_t>(
self.mutable_data(place, framework::TransToPhiDataType(type)));
})
.def("_mutable_data",
[](framework::Tensor &self,
paddle::platform::XPUPlace &place,
paddle::framework::proto::VarType::Type type) {
return reinterpret_cast<uintptr_t>(
self.mutable_data(place, framework::TransToPhiDataType(type)));
})
.def("_mutable_data",
[](framework::Tensor &self,
paddle::platform::CUDAPlace &place,
paddle::framework::proto::VarType::Type type) {
return reinterpret_cast<uintptr_t>(
self.mutable_data(place, framework::TransToPhiDataType(type)));
})
.def("_mutable_data",
[](framework::Tensor &self,
paddle::platform::CUDAPinnedPlace &place,
paddle::framework::proto::VarType::Type type) {
return reinterpret_cast<uintptr_t>(
self.mutable_data(place, framework::TransToPhiDataType(type)));
})
.def("_mutable_data",
[](framework::Tensor &self,
paddle::platform::MLUPlace &place,
paddle::framework::proto::VarType::Type type) {
return reinterpret_cast<uintptr_t>(
self.mutable_data(place, framework::TransToPhiDataType(type)));
})
.def("_clear", &framework::Tensor::clear)
.def("_mutable_data",
[](framework::Tensor &self,
paddle::platform::NPUPlace &place,
paddle::framework::proto::VarType::Type type) {
return reinterpret_cast<uintptr_t>(
self.mutable_data(place, framework::TransToPhiDataType(type)));
})
// "_copy_from": one overload per place type, each bound to the matching
// TensorCopyFrom<PlaceType> instantiation. Python keyword names are
// (tensor, place, batch_size); batch_size defaults to -1.
// The final overload takes the generic platform::Place.
.def("_copy_from",
&TensorCopyFrom<paddle::platform::CPUPlace>,
py::arg("tensor"),
py::arg("place"),
py::arg("batch_size") = -1)
.def("_copy_from",
&TensorCopyFrom<paddle::platform::CustomPlace>,
py::arg("tensor"),
py::arg("place"),
py::arg("batch_size") = -1)
.def("_copy_from",
&TensorCopyFrom<paddle::platform::XPUPlace>,
py::arg("tensor"),
py::arg("place"),
py::arg("batch_size") = -1)
.def("_copy_from",
&TensorCopyFrom<paddle::platform::CUDAPlace>,
py::arg("tensor"),
py::arg("place"),
py::arg("batch_size") = -1)
.def("_copy_from",
&TensorCopyFrom<paddle::platform::NPUPlace>,
py::arg("tensor"),
py::arg("place"),
py::arg("batch_size") = -1)
.def("_copy_from",
&TensorCopyFrom<paddle::platform::CUDAPinnedPlace>,
py::arg("tensor"),
py::arg("place"),
py::arg("batch_size") = -1)
.def("_copy_from",
&TensorCopyFrom<paddle::platform::MLUPlace>,
py::arg("tensor"),
py::arg("place"),
py::arg("batch_size") = -1)
.def("_copy_from",
&TensorCopyFrom<paddle::platform::Place>,
py::arg("tensor"),
py::arg("place"),
py::arg("batch_size") = -1)
// "set": one overload per place type, each bound to the matching
// SetTensorFromPyArray<PlaceType> instantiation. Python keyword names are
// (array, place, zero_copy); zero_copy defaults to false.
// Only the last overload carries the docstring. The docstring names the
// first argument "array" so that it agrees with py::arg("array"), and its
// place list includes CustomPlace because an overload for it is registered.
.def("set",
     SetTensorFromPyArray<paddle::platform::CPUPlace>,
     py::arg("array"),
     py::arg("place"),
     py::arg("zero_copy") = false)
.def("set",
     SetTensorFromPyArray<paddle::platform::CustomPlace>,
     py::arg("array"),
     py::arg("place"),
     py::arg("zero_copy") = false)
.def("set",
     SetTensorFromPyArray<paddle::platform::XPUPlace>,
     py::arg("array"),
     py::arg("place"),
     py::arg("zero_copy") = false)
.def("set",
     SetTensorFromPyArray<paddle::platform::CUDAPlace>,
     py::arg("array"),
     py::arg("place"),
     py::arg("zero_copy") = false)
.def("set",
     SetTensorFromPyArray<paddle::platform::NPUPlace>,
     py::arg("array"),
     py::arg("place"),
     py::arg("zero_copy") = false)
.def("set",
     SetTensorFromPyArray<paddle::platform::IPUPlace>,
     py::arg("array"),
     py::arg("place"),
     py::arg("zero_copy") = false)
.def("set",
     SetTensorFromPyArray<paddle::platform::MLUPlace>,
     py::arg("array"),
     py::arg("place"),
     py::arg("zero_copy") = false)
.def("set",
     SetTensorFromPyArray<paddle::platform::CUDAPinnedPlace>,
     py::arg("array"),
     py::arg("place"),
     py::arg("zero_copy") = false,
     R"DOC(
Set the data of Tensor on place with given numpy array.
Args:
array (numpy.ndarray): The data to set.
place (CPUPlace|CUDAPlace|XPUPlace|IPUPlace|CUDAPinnedPlace|NPUPlace|MLUPlace|CustomPlace): The place where the
Tensor is to be set.
zero_copy (bool, optional): Whether to share memory with the input numpy array.
This parameter only works with CPUPlace. Default: False.
Returns:
None.
Examples:
.. code-block:: python
import paddle.fluid as fluid
import numpy as np
t = fluid.Tensor()
t.set(np.ndarray([5, 30]), fluid.CPUPlace())
)DOC")
// "shape": the tensor's dimensions converted to a plain vector.
.def(
    "shape",
    [](framework::Tensor &tensor) {
      const auto &dims = tensor.dims();
      return vectorize(dims);
    },
    R"DOC(
Return the shape of Tensor.
Returns:
list[int]: The shape of Tensor.
Examples:
.. code-block:: python
import paddle.fluid as fluid
import numpy as np
t = fluid.Tensor()
t.set(np.ndarray([5, 30]), fluid.CPUPlace())
print(t.shape()) # [5, 30]
)DOC")
// "_to_dlpack": wraps the tensor in a DLManagedTensor and returns it inside
// a PyCapsule named "dltensor".
.def("_to_dlpack",
     [](framework::Tensor &self) {
       DLPackTensor dlpack_tensor(self, 1);
       DLManagedTensor *dmt = dlpack_tensor.ToDLManagedTensor();
       auto capsule = py::capsule(
           static_cast<void *>(dmt), "dltensor", [](PyObject *ptr) {
             // Capsule destructor. A consumer that has taken ownership
             // renames the capsule (to "used_dltensor"), in which case
             // there is nothing left for us to release.
             //
             // PyCapsule_GetPointer never throws a C++ exception; on a name
             // mismatch it returns NULL and sets a Python error. So the
             // name is tested with PyCapsule_IsValid, which reports a
             // mismatch without raising, instead of with try/catch.
             if (ptr == nullptr || !PyCapsule_IsValid(ptr, "dltensor")) {
               return;
             }
             // Still named "dltensor": nobody consumed it, so we own the
             // managed tensor and must run its deleter.
             auto *dltensor = static_cast<DLManagedTensor *>(
                 PyCapsule_GetPointer(ptr, "dltensor"));
             if (dltensor != nullptr && dltensor->deleter != nullptr) {
               dltensor->deleter(dltensor);
             }
           });
       return capsule;
     })
// Typed single-element setters and getters.
.def("_set_float_element", TensorSetElement<float>)
.def("_get_float_element", TensorGetElement<float>)
.def("_set_double_element", TensorSetElement<double>)
.def("_get_double_element", TensorGetElement<double>)
// Place the tensor currently reports.
.def("_place",
     [](framework::Tensor &tensor) {
       return tensor.place();
     })
// Element type, converted to the proto VarType enumeration.
.def("_dtype",
     [](framework::Tensor &tensor) {
       const auto phi_dtype = tensor.type();
       return framework::TransToProtoVarType(phi_dtype);
     })
// Data layout rendered as a string.
.def("_layout",
     [](framework::Tensor &tensor) {
       const auto layout = tensor.layout();
       return DataLayoutToString(layout);
     })
.def("_share_data_with", &framework::Tensor::ShareDataWith)
.def("__getitem__", PySliceTensor, py::return_value_policy::reference)
// Text form produced by streaming the tensor into a string stream.
.def("__str__",
     [](const framework::Tensor &tensor) {
       std::stringstream buffer;
       buffer << tensor;
       return buffer.str();
     }) /* ------ End of original Tensor ------ */
// "__init__" (with lengths): constructs the tensor in place from
// length-based recursive sequence lengths. The lengths are converted to an
// offset-based LoD, validated with CheckLoD(..., -1), and passed to the
// Tensor constructor. Raises InvalidArgument when the converted LoD is
// rejected by CheckLoD.
.def("__init__",
     [](framework::Tensor &instance,
        const std::vector<std::vector<size_t>>
            &recursive_sequence_lengths) {
       LoD new_lod;
       new_lod.reserve(recursive_sequence_lengths.size());
       std::copy(recursive_sequence_lengths.begin(),
                 recursive_sequence_lengths.end(),
                 std::back_inserter(new_lod));
       LoD new_offset_lod = ConvertToOffsetBasedLoD(new_lod);
       // The message announces "the LoD converted by
       // recursive_sequence_lengths", so format the converted
       // (offset-based) LoD rather than the unconverted input.
       PADDLE_ENFORCE_EQ(
           CheckLoD(new_offset_lod, -1),
           true,
           platform::errors::InvalidArgument(
               "The provided recursive_sequence_lengths info is "
               "invalid, "
               "the LoD converted by recursive_sequence_lengths is %s",
               new_offset_lod));
       new (&instance) framework::Tensor(new_offset_lod);
     })
// "__init__" (no arguments): default-constructs the tensor in place.
.def("__init__",
     [](framework::Tensor &instance) {
       new (&instance) framework::Tensor();
     })
// LoD is offset-based on the C++ side, while the Python API works with
// sequence lengths. To avoid misuse, the length-based setter is exposed as
// set_recursive_sequence_lengths, and set_lod below accepts offsets only.
// Background discussion:
// https://github.com/PaddlePaddle/Paddle/issues/10855
.def(
    "set_lod",
    [](framework::Tensor &tensor,
       const std::vector<std::vector<size_t>> &lod) {
      // `lod` is offset-based level-of-detail info; copy it level by level
      // into the framework's LoD container.
      LoD offsets;
      offsets.reserve(lod.size());
      for (const auto &level : lod) {
        offsets.push_back(level);
      }
      // Validate against the first dimension of the tensor.
      const auto first_dim = vectorize(tensor.dims()).front();
      PADDLE_ENFORCE_EQ(
          CheckLoD(offsets, first_dim),
          true,
          platform::errors::InvalidArgument(
              "The provided LoD is invalid, the LoD is %s", offsets));
      tensor.set_lod(offsets);
    },
    py::arg("lod"),
    R"DOC(
Set LoD of the Tensor.
Args:
lod (list[list[int]]): The lod to set.
Returns:
None.
Examples:
.. code-block:: python
import paddle.fluid as fluid
import numpy as np
t = fluid.Tensor()
t.set(np.ndarray([5, 30]), fluid.CPUPlace())
t.set_lod([[0, 2, 5]])
print(t.lod()) # [[0, 2, 5]]
)DOC")
// "set_recursive_sequence_lengths": takes length-based level-of-detail
// info, converts it to an offset-based LoD, validates it against the first
// dimension of the tensor, and stores the offset-based form. Raises
// InvalidArgument when CheckLoD rejects the converted LoD.
.def(
    "set_recursive_sequence_lengths",
    [](framework::Tensor &self,
       const std::vector<std::vector<size_t>>
           &recursive_sequence_lengths) {
      // the input recursive_sequence_lengths is length-based
      // level-of-detail info
      LoD new_lod;
      new_lod.reserve(recursive_sequence_lengths.size());
      std::copy(recursive_sequence_lengths.begin(),
                recursive_sequence_lengths.end(),
                std::back_inserter(new_lod));
      LoD new_offset_lod = ConvertToOffsetBasedLoD(new_lod);
      // The message announces "the LoD converted by
      // recursive_sequence_lengths", so format the converted
      // (offset-based) LoD rather than the unconverted input.
      PADDLE_ENFORCE_EQ(
          CheckLoD(new_offset_lod, vectorize(self.dims()).front()),
          true,
          platform::errors::InvalidArgument(
              "The provided recursive_sequence_lengths info is "
              "invalid, "
              "the LoD converted by recursive_sequence_lengths is "
              "%s",
              new_offset_lod));
      self.set_lod(new_offset_lod);
    },
    py::arg("recursive_sequence_lengths"),
    R"DOC(
Set LoD of the Tensor according to recursive sequence lengths.
For example, if recursive_sequence_lengths=[[2, 3]], which means
there are two sequences with length 2 and 3 respectively, the
corresponding lod would be [[0, 2, 2+3]], i.e., [[0, 2, 5]].
Args:
recursive_sequence_lengths (list[list[int]]): The recursive sequence lengths.
Returns:
None.
Examples:
.. code-block:: python
import paddle.fluid as fluid
import numpy as np
t = fluid.Tensor()
t.set(np.ndarray([5, 30]), fluid.CPUPlace())
t.set_recursive_sequence_lengths([[2, 3]])
print(t.recursive_sequence_lengths()) # [[2, 3]]
print(t.lod()) # [[0, 2, 5]]
)DOC")
// "lod": returns the tensor's offset-based LoD as plain nested vectors.
.def(
    "lod",
    [](framework::Tensor &tensor) -> std::vector<std::vector<size_t>> {
      LoD offsets = tensor.lod();
      std::vector<std::vector<size_t>> result;
      result.reserve(offsets.size());
      // Copy level by level; each level is converted to std::vector.
      for (auto &level : offsets) {
        result.push_back(level);
      }
      return result;
    },
    R"DOC(
Return the LoD of the Tensor.
Returns:
list[list[int]]: The lod of the Tensor.
Examples:
.. code-block:: python
import paddle.fluid as fluid
import numpy as np
t = fluid.Tensor()
t.set(np.ndarray([5, 30]), fluid.CPUPlace())
t.set_lod([[0, 2, 5]])
print(t.lod()) # [[0, 2, 5]]
)DOC")
// See the comment above set_lod for why the length-based view has its own
// name.
.def(
    "recursive_sequence_lengths",
    [](framework::Tensor &tensor) -> std::vector<std::vector<size_t>> {
      // Convert the stored LoD to its length-based form, then hand it
      // back as plain nested vectors.
      LoD lengths = phi::ConvertToLengthBasedLoD(tensor.lod());
      std::vector<std::vector<size_t>> result;
      result.reserve(lengths.size());
      for (auto &level : lengths) {
        result.push_back(level);
      }
      return result;
    },
    R"DOC(
Return the recursive sequence lengths corresponding to of the LodD
of the Tensor.
Returns:
list[list[int]]: The recursive sequence lengths.
Examples:
.. code-block:: python
import paddle.fluid as fluid
import numpy as np
t = fluid.Tensor()
t.set(np.ndarray([5, 30]), fluid.CPUPlace())
t.set_recursive_sequence_lengths([[2, 3]])
print(t.recursive_sequence_lengths()) # [[2, 3]]
)DOC")
// "has_valid_recursive_sequence_lengths": runs CheckLoD on the stored LoD
// against the first dimension of the tensor and returns the verdict.
.def(
    "has_valid_recursive_sequence_lengths",
    [](framework::Tensor &tensor) -> bool {
      const auto outer_dim = vectorize(tensor.dims()).front();
      const bool is_valid = CheckLoD(tensor.lod(), outer_dim);
      return is_valid;
    },
    R"DOC(
Check whether the LoD of the Tensor is valid.
Returns:
bool: Whether the LoD is valid.
Examples:
.. code-block:: python
import paddle.fluid as fluid
import numpy as np
t = fluid.Tensor()
t.set(np.ndarray([5, 30]), fluid.CPUPlace())
t.set_recursive_sequence_lengths([[2, 3]])
print(t.has_valid_recursive_sequence_lengths()) # True
)DOC")
// "_as_type": returns a new Tensor produced by TransDataType(self, type).
// If the source is uninitialised or has no elements, the result is a
// default-constructed Tensor.
.def("_as_type",
     [](const framework::Tensor &tensor,
        paddle::framework::proto::VarType::Type type) {
       framework::Tensor converted;
       const bool has_data = tensor.IsInitialized() && tensor.numel() > 0;
       if (has_data) {
         TransDataType(tensor, type, &converted);
       }
       return converted;
     })
// "_copy": returns a new Tensor. When the source is initialised and
// non-empty it is filled by TensorCopySync(self, place, &dst); otherwise the
// result is cleared and resized to {0}. The source LoD is applied to the
// result in both cases.
.def("_copy",
[](const framework::Tensor &self, const platform::Place &place) {
// follow fetch_op's implementation
framework::Tensor dst;
if (self.IsInitialized() && self.numel() > 0) {
TensorCopySync(self, place, &dst);
} else {
// Nothing to copy if the source tensor is empty.
dst.clear();
dst.Resize({0});
}
dst.set_lod(self.lod());
return dst;
// On Windows the Tensor binding chain ends here. On other platforms it
// continues with the bindings that follow, up to the matching #endif.
#ifdef _WIN32
});
#else
})
#ifdef PADDLE_WITH_CUDA
// "_share_buffer_with": makes `self` use the memory of `src`, which must be
// backed by a CudaIpcAllocation (otherwise PreconditionNotMet is raised).
// The tuple `t` is read positionally:
//   t[0] size, t[1] data type (as int), t[2] dims, t[3] LoD, t[4] device id.
// A new Allocation is built from the IPC allocation's ptr()/base_ptr() with
// that size on CUDAPlace(device id), then installed as the holder of `self`
// before dims and LoD are applied.
.def("_share_buffer_with",
[](framework::Tensor &self, const framework::Tensor src,
py::tuple t) {
auto *cuda_ipc_allocation =
dynamic_cast<memory::allocation::CudaIpcAllocation *>(
src.Holder().get());
PADDLE_ENFORCE_NOT_NULL(
cuda_ipc_allocation,
platform::errors::PreconditionNotMet(
"Tensor is not Cuda IPC shared tensor. "
"Now only Tensor shared by cuda ipc could use this "
"api."));
size_t size = t[0].cast<size_t>();
auto dtype =
static_cast<paddle::experimental::DataType>(t[1].cast<int>());
auto dims = phi::make_ddim(t[2].cast<std::vector<int>>());
auto lod_info = t[3].cast<framework::LoD>();
auto device_id = t[4].cast<int>();
auto shared_reader_holder =
std::make_shared<memory::allocation::Allocation>(
cuda_ipc_allocation->ptr(),
cuda_ipc_allocation->base_ptr(), size,
platform::CUDAPlace(device_id));
self.ResetHolderWithType(shared_reader_holder, dtype);
self.Resize(dims);
self.set_lod(lod_info);
VLOG(6) << "Reconstructed tensor with buffer shared!";
},
R"DOC(
Deserialize GPU Tensor for existed shared Cuda IPC tensor.
Params:
tensor: Shared Cuda IPC tensor.
tuple: contrains data size, data type,
tensor dims, lod information, device index.
)DOC")
// "_share_cuda": exports a GPU tensor through a CUDA IPC memory handle.
// Throws std::runtime_error if the tensor is uninitialised or empty, and
// InvalidArgument if its holder is not on a GPU place.
// The returned tuple has seven entries, in this order:
//   (handle bytes, byte offset of the data from the allocation's base
//    pointer, data size in bytes, data type as int, dims, LoD, device id).
// The byte offset is not mentioned in the docstring below, but
// "_new_shared_cuda" reads it as t[1].
// NOTE(review): the dynamic_cast result `holder` is dereferenced without a
// null check; confirm Holder() always yields a memory::allocation::Allocation
// here.
.def("_share_cuda",
[](framework::Tensor self) {
if (!self.IsInitialized() || self.numel() == 0)
throw std::runtime_error(
"Tensor not initialized or numel is 0. could not pass "
"to shared memory. ");
auto *holder = dynamic_cast<memory::allocation::Allocation *>(
self.Holder().get());
PADDLE_ENFORCE_EQ(
platform::is_gpu_place(holder->place()), true,
platform::errors::InvalidArgument(
"Tensor is not on GPU. share_cuda only support GPU "
"Tensor, share_filename is for CPU tensor."));
void *base_ptr = holder->base_ptr();
ptrdiff_t offset_bytes = reinterpret_cast<char *>(holder->ptr()) -
reinterpret_cast<char *>(base_ptr);
cudaIpcMemHandle_t handle;
PADDLE_ENFORCE_GPU_SUCCESS(cudaIpcGetMemHandle(&handle, base_ptr));
auto _handle = py::bytes(reinterpret_cast<char *>(&handle),
(py::ssize_t)CUDA_IPC_HANDLE_SIZE);
// TODO(ZHUI): use cuda event, to avoid sync.
const auto &device_id = paddle::platform::GetCurrentDeviceId();
auto stream =
paddle::platform::stream::get_current_stream(device_id);
stream->Synchronize();
int type_idx = static_cast<int>(self.type());
size_t data_size =
self.numel() *
framework::SizeOfType(
framework::TransToProtoVarType(self.type()));
return py::make_tuple(_handle, (py::size_t)offset_bytes, data_size,
type_idx, vectorize(self.dims()), self.lod(),
device_id);
},
R"DOC(
Serialize GPU Tensor by cudaIpcMemHandle.
Returns:
tuple: contrains handle, data size, data type,
tensor dims, lod information, device index.
Examples:
.. code-block:: python
import paddle
tensor = paddle.ones([3,3])
metainfo = tensor.value().get_tensor()._share_cuda()
)DOC")
// "_new_shared_cuda": rebuilds a Tensor from the seven-entry tuple produced
// by "_share_cuda". Throws std::runtime_error unless the tuple has exactly
// seven entries. Entries are read positionally:
//   t[0] handle, t[1] byte offset, t[2] size, t[3] data type (as int),
//   t[4] dims, t[5] LoD, t[6] device id.
// The data pointer is the IPC base pointer advanced by the byte offset; the
// base pointer is moved into the new CudaIpcAllocation.
.def("_new_shared_cuda",
[](py::tuple t) {
if (t.size() != 7)
throw std::runtime_error(
"Invalid Tensor meta info for shared cuda tensor!");
// 1. Create a new C++ instance
framework::Tensor tensor;
// 2. Rebuild Allocation from handle
const std::string &handle = t[0].cast<std::string>();
ptrdiff_t offset_bytes = (ptrdiff_t)t[1].cast<int64_t>();
auto device_id = t[6].cast<int>();
auto base_ptr = memory::allocation::GetIpcBasePtr(handle);
size_t size = t[2].cast<size_t>();
void *dev = base_ptr.get();
dev = reinterpret_cast<char *>(dev) + offset_bytes;
auto shared_reader_holder =
std::make_shared<memory::allocation::CudaIpcAllocation>(
dev, size, device_id, std::move(base_ptr));
// 3. Rebuild Tensor
tensor.ResetHolderWithType(
shared_reader_holder,
static_cast<paddle::experimental::DataType>(t[3].cast<int>()));
tensor.Resize(phi::make_ddim(t[4].cast<std::vector<int>>()));
tensor.set_lod(t[5].cast<framework::LoD>());
return tensor;
},
R"DOC(
Deserialize GPU lod tensor from cudaIpcMemHandle.
Params:
tuple: contrains handle, data size, data type,
tensor dims, lod information, device index.
Examples:
.. code-block:: python
import paddle
tensor = paddle.ones([3,3])
metainfo = tensor.value().get_tensor()._share_cuda()
tensor_from_shared = paddle.to_tensor(paddle.fluid.core.LoDTensor._new_shared_cuda(metainfo))
)DOC")
#endif
// "_share_filename": exports a CPU (or CUDA-pinned) tensor through a
// reference-counted memory-mapped allocation.
// Throws std::runtime_error if the tensor is uninitialised or empty, and
// InvalidArgument if its holder is on neither a CPU nor a CUDA-pinned place.
// If the holder is not already a RefcountedMemoryMapAllocation, a new one is
// allocated under a fresh IPC name, the data is copied into it, and it
// replaces the tensor's holder.
// Returns the tuple (ipc name, allocation size, data type as int, dims, LoD).
.def("_share_filename",
[](framework::Tensor &self) {
if (!self.IsInitialized() || self.numel() == 0)
throw std::runtime_error(
"Tensor not initialized or numel is 0. could not pass to "
"shared memory. ");
auto holder = self.Holder();
PADDLE_ENFORCE_EQ(
platform::is_cpu_place(holder->place()) ||
platform::is_cuda_pinned_place(holder->place()),
true, platform::errors::InvalidArgument(
"Tensor is not on CPU. share_filename only "
"support CPU Tensor."));
auto *mmap_allocation = dynamic_cast<
memory::allocation::RefcountedMemoryMapAllocation *>(
holder.get());
// If the tensor is not shared, allocate memory map allocation.
if (mmap_allocation == nullptr) {
void *data_ptr = self.data();
size_t data_size =
self.numel() *
framework::SizeOfType(
framework::TransToProtoVarType(self.type()));
int flags = memory::allocation::MAPPED_SHAREDMEM |
memory::allocation::MAPPED_EXCLUSIVE;
std::string handle = memory::allocation::GetIPCName();
auto shared_holder =
memory::allocation::AllocateRefcountedMemoryMapAllocation(
handle, flags, data_size);
// copy data & reset holder
// NOTE(review): for a CUDA-pinned source the copy below is compiled only
// when PADDLE_WITH_CUDA is defined; without it this branch copies nothing.
if (platform::is_cuda_pinned_place(holder->place())) {
#ifdef PADDLE_WITH_CUDA
memory::Copy(platform::CPUPlace(), shared_holder->ptr(),
platform::CUDAPinnedPlace(), data_ptr, data_size);
#endif
} else {
memory::Copy(platform::CPUPlace(), shared_holder->ptr(),
platform::CPUPlace(), data_ptr, data_size);
}
self.ResetHolder(shared_holder);
mmap_allocation = shared_holder.get();
}
int type_idx = static_cast<int>(self.type());
return py::make_tuple(mmap_allocation->ipc_name(),
mmap_allocation->size(), type_idx,
vectorize(self.dims()), self.lod());
},
R"DOC(
Serialize CPU lod tensor in shared memory to tuple.
If the tensor is not in shared memory, we will copy it first.
Returns:
tuple: contrains ipc name, data size, data type,
tensor dims and lod imformation.
Examples:
.. code-block:: python
import paddle
tensor = paddle.ones([3,3])
metainfo = tensor.value().get_tensor()._share_filename()
)DOC")
// "_new_shared_filename": rebuilds a Tensor from the five-entry tuple
// produced by "_share_filename". Throws std::runtime_error unless the tuple
// has exactly five entries. Entries are read positionally:
//   t[0] ipc name, t[1] size, t[2] data type (as int), t[3] dims, t[4] LoD.
// The allocation is requested with MAPPED_SHAREDMEM | MAPPED_NOCREATE.
.def("_new_shared_filename",
[](py::tuple t) { // same role as __setstate__: rebuild from a tuple
if (t.size() != 5)
throw std::runtime_error("Invalid Tensor meta info state!");
// 1. Create a new C++ instance
framework::Tensor tensor;
// 2. Rebuild Allocation
const std::string &ipc_name = t[0].cast<std::string>();
size_t size = t[1].cast<size_t>();
int flags = memory::allocation::MAPPED_SHAREDMEM |
memory::allocation::MAPPED_NOCREATE;
auto shared_holder =
memory::allocation::AllocateRefcountedMemoryMapAllocation(
ipc_name, flags, size);
// 3. Rebuild Tensor
tensor.ResetHolderWithType(
shared_holder,
static_cast<paddle::experimental::DataType>(t[2].cast<int>()));
tensor.Resize(phi::make_ddim(t[3].cast<std::vector<int>>()));
tensor.set_lod(t[4].cast<framework::LoD>());
return tensor;
},
R"DOC(
Deserialize CPU lod tensor from shared memory.
Params:
tuple: contrains ipc file name, data size, data type,
tensor dims and lod information.
Examples:
.. code-block:: python
import paddle
tensor = paddle.ones([3,3])
metainfo = tensor.value().get_tensor()._share_filename()
tensor_from_shared = paddle.to_tensor(paddle.fluid.core.LoDTensor._new_shared_filename(metainfo))
)DOC")
// "_shared_incref": calls incref() on the tensor's holder when that holder
// is a RefcountedMemoryMapAllocation; otherwise does nothing.
.def("_shared_incref",
     [](framework::Tensor &tensor) {
       using MmapAlloc = memory::allocation::RefcountedMemoryMapAllocation;
       auto *mmap_alloc = dynamic_cast<MmapAlloc *>(tensor.Holder().get());
       if (mmap_alloc == nullptr) {
         return;
       }
       mmap_alloc->incref();
     },
     R"DOC(
Increase reference count of share_filename tensor.
)DOC")
// "_shared_decref": calls decref() on the tensor's holder when that holder
// is a RefcountedMemoryMapAllocation; otherwise does nothing.
.def("_shared_decref",
     [](framework::Tensor &tensor) {
       using MmapAlloc = memory::allocation::RefcountedMemoryMapAllocation;
       auto *mmap_alloc = dynamic_cast<MmapAlloc *>(tensor.Holder().get());
       if (mmap_alloc == nullptr) {
         return;
       }
       mmap_alloc->decref();
     },
     R"DOC(
Decrease reference count of share_filename tensor.
)DOC")
// Pickle support for Tensor.
// __getstate__: requires the holder to be on a CPU place and to be a
// MemoryMapWriterAllocation (PreconditionNotMet otherwise). Produces the
// tuple (ipc name, allocation size, data type as int, dims, LoD).
// __setstate__: requires a five-entry tuple (std::runtime_error otherwise),
// rebuilds a reader allocation from the ipc name and size, records the ipc
// name in MemoryMapFdSet, and reconstructs the Tensor from the rest.
.def(py::pickle(
[](const framework::Tensor &t) { // __getstate__
auto holder = t.Holder();
PADDLE_ENFORCE_EQ(platform::is_cpu_place(holder->place()), true,
platform::errors::PreconditionNotMet(
"Tensor is not on CPU."
"Now only Tensor on CPU can be serialized."));
auto *mmap_writer_allocation =
dynamic_cast<memory::allocation::MemoryMapWriterAllocation *>(
holder.get());
PADDLE_ENFORCE_NOT_NULL(
mmap_writer_allocation,
platform::errors::PreconditionNotMet(
"Tensor is not in shared memory."
"Now only Tensor on shared memory can be serialized."));
int type_idx = static_cast<int>(t.type());
return py::make_tuple(mmap_writer_allocation->ipc_name(),
mmap_writer_allocation->size(), type_idx,
vectorize(t.dims()), t.lod());
},
[](py::tuple t) { // __setstate__
if (t.size() != 5)
throw std::runtime_error("Invalid Tensor state!");
// 1. Create a new C++ instance
framework::Tensor tensor;
// 2. Rebuild Allocation
const std::string &ipc_name = t[0].cast<std::string>();
size_t size = t[1].cast<size_t>();
auto shared_reader_holder =
memory::allocation::RebuildMemoryMapReaderAllocation(ipc_name,
size);
// 3. Maintain global fd set
VLOG(3) << "Tensor ipc name: " << ipc_name;
memory::allocation::MemoryMapFdSet::Instance().Insert(ipc_name);
// 4. Rebuild Tensor
tensor.ResetHolderWithType(
shared_reader_holder,
static_cast<paddle::experimental::DataType>(t[2].cast<int>()));
tensor.Resize(phi::make_ddim(t[3].cast<std::vector<int>>()));
tensor.set_lod(t[4].cast<framework::LoD>());
return tensor;
}));
// Closes the "#ifdef _WIN32 ... #else" opened at the end of "_copy".
#endif
// Python binding for phi::SelectedRows, exposed as "SelectedRows".
py::class_<phi::SelectedRows>(m, "SelectedRows")
    // Default construction in place.
    .def("__init__",
         [](phi::SelectedRows &target) {
           new (&target) phi::SelectedRows();
         })
    // Construction in place from a row list and a height.
    .def("__init__",
         [](phi::SelectedRows &target,
            const std::vector<int64_t> rows,
            const int64_t &height) {
           new (&target) phi::SelectedRows(rows, height);
         })
    // Result of mutable_value(), returned with reference policy.
    .def(
        "get_tensor",
        [](phi::SelectedRows &selected) {
          return selected.mutable_value();
        },
        py::return_value_policy::reference)
    // Element count of the value tensor.
    .def("numel",
         [](phi::SelectedRows &selected) -> int64_t {
           const auto &value = selected.value();
           return value.numel();
         })
    .def("set_height", &phi::SelectedRows::set_height)
    .def("height", &phi::SelectedRows::height)
    // GPU (CUDA/HIP) builds wrap the rows in a Vector<int64_t> first;
    // other builds pass the std::vector straight through.
    .def("set_rows",
         [](phi::SelectedRows &selected, std::vector<int64_t> rows) {
#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
           selected.set_rows(rows);
#else
           Vector<int64_t> device_rows(rows);
           selected.set_rows(device_rows);
#endif
         })
    .def("sync_index",
         [](phi::SelectedRows &selected) { selected.SyncIndex(); })
    // Row indices copied out into a plain std::vector.
    .def("rows", [](phi::SelectedRows &selected) {
      auto stored_rows = selected.rows();
      std::vector<int64_t> result;
      result.reserve(stored_rows.size());
      for (auto it = stored_rows.begin(); it != stored_rows.end(); ++it) {
        result.push_back(*it);
      }
      return result;
    });
py::class_<Variable>(m, "Variable", R"DOC(Variable Class.
All parameter, weight, gradient are variables in Paddle.
......@@ -2272,603 +1352,6 @@ All parameter, weight, gradient are variables in Paddle.
#endif
return devices;
});
// Python binding for platform::CustomPlace, exposed as "CustomPlace".
py::class_<platform::CustomPlace> customplace(m,
                                              "CustomPlace",
                                              R"DOC(
CustomPlace is a descriptor of a device.
It represents a custom device on which a tensor will be allocated and a model will run.
Examples:
.. code-block:: python
import paddle
fake_cpu_place = paddle.CustomPlace("FakeCPU", 0)
)DOC");
// Remember the Python type object of the freshly registered class.
g_customplace_pytype = reinterpret_cast<PyTypeObject *>(customplace.ptr());
customplace
    // Constructor: CustomPlace(device_type, dev_id).
    // Every validation failure is logged with LOG(ERROR) and then
    // terminates the process via std::exit(-1); no Python exception is
    // raised.
    //  - With PADDLE_WITH_CUSTOM_DEVICE: dev_id must be non-negative, the
    //    device type must be registered as a custom device, and dev_id must
    //    be below the device count reported for that type.
    //  - Without it: construction always fails with an explanatory message.
    .def("__init__",
         [](platform::CustomPlace &self,
            const std::string &device_type,
            int dev_id) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
           if (UNLIKELY(dev_id < 0)) {
             LOG(ERROR) << string::Sprintf(
                 "Invalid CustomPlace(%s, %d), device id must be 0 "
                 "or "
                 "positive integer",
                 device_type,
                 dev_id);
             std::exit(-1);
           }

           if (LIKELY(phi::DeviceManager::HasDeviceType(device_type) &&
                      phi::DeviceManager::IsCustom(device_type))) {
             int dev_count = static_cast<int>(
                 phi::DeviceManager::GetDeviceCount(device_type));
             if (UNLIKELY(dev_id >= dev_count)) {
               if (dev_count == 0) {
                 LOG(ERROR) << "Cannot use " << device_type
                            << " because there is no " << device_type
                            << " detected on your "
                               "machine.";
                 std::exit(-1);
               } else {
                 LOG(ERROR) << string::Sprintf(
                     "Invalid CustomPlace(%s, %d), dev_id must "
                     "inside "
                     "[0, %d), because %s "
                     "number on your machine is %d",
                     device_type,
                     dev_id,
                     dev_count,
                     device_type,
                     dev_count);
                 std::exit(-1);
               }
             }
             new (&self) platform::CustomPlace(device_type, dev_id);
           } else {
             LOG(ERROR) << string::Sprintf(
                 "Invalid CustomPlace(%s, %d), the device type is "
                 "not registered "
                 "as a custom device.",
                 device_type,
                 dev_id);
             std::exit(-1);
           }
#else
           // Adjacent string literals are concatenated verbatim, so each
           // fragment that ends mid-sentence needs its own trailing space.
           LOG(ERROR) << string::Sprintf(
               "Cannot use CustomDevice because you have installed CPU/GPU "
               "version PaddlePaddle.\n"
               "If you want to use CustomDevice, please try to install "
               "CustomDevice version "
               "PaddlePaddle by: pip install paddlepaddle\n"
               "If you only have CPU, please change "
               "CustomPlace(%s, %d) to be CPUPlace().\n",
               device_type, dev_id);
           std::exit(-1);
#endif
         })
    .def("_type", &PlaceIndex<platform::CustomPlace>)
    .def("get_device_id",
         [](const platform::CustomPlace &self) { return self.GetDeviceId(); })
    .def("get_device_type",
         [](const platform::CustomPlace &self) {
           return self.GetDeviceType();
         })
    .def("__repr__", string::to_string<const platform::CustomPlace &>)
    .def("__str__", string::to_string<const platform::CustomPlace &>);
// Python binding for platform::CUDAPlace, exposed as "CUDAPlace".
// Constructor behaviour depends on the build:
//  - CUDA/HIP builds: dev_id must be in [0, GetGPUDeviceCount()); otherwise
//    an error is logged and the process exits via std::exit(-1).
//  - Other builds: construction always logs an explanatory error and exits.
// "_type", "_equals", "get_device_id" and "_get_device_id" are registered
// only in CUDA/HIP builds; "__repr__" and "__str__" are always registered.
py::class_<platform::CUDAPlace> cudaplace(m, "CUDAPlace", R"DOC(
CUDAPlace is a descriptor of a device.
It represents a GPU device allocated or to be allocated with Tensor or LoDTensor.
Each CUDAPlace has a dev_id to indicate the graphics card ID represented by the current CUDAPlace,
staring from 0.
The memory of CUDAPlace with different dev_id is not accessible.
Numbering here refers to the logical ID of the visible graphics card, not the actual ID of the graphics card.
You can set visible GPU devices by setting the `CUDA_VISIBLE_DEVICES` environment variable.
When the program starts, visible GPU devices will be numbered from 0.
If `CUDA_VISIBLE_DEVICES` is not set, all devices are visible by default,
and the logical ID is the same as the actual ID.
Parameters:
id (int): GPU device ID.
Examples:
.. code-block:: python
import paddle
place = paddle.CUDAPlace(0)
)DOC");
// Remember the Python type object of the freshly registered class.
g_cudaplace_pytype = reinterpret_cast<PyTypeObject *>(cudaplace.ptr());
cudaplace
.def("__init__",
[](platform::CUDAPlace &self, int dev_id) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// Reject negative ids first, then ids at or beyond the device count.
if (UNLIKELY(dev_id < 0)) {
LOG(ERROR) << string::Sprintf(
"Invalid CUDAPlace(%d), device id must be 0 or "
"positive integer",
dev_id);
std::exit(-1);
}
if (UNLIKELY(dev_id >= platform::GetGPUDeviceCount())) {
if (platform::GetGPUDeviceCount() == 0) {
LOG(ERROR) << "Cannot use GPU because there is no GPU "
"detected on your "
"machine.";
std::exit(-1);
} else {
LOG(ERROR) << string::Sprintf(
"Invalid CUDAPlace(%d), must inside [0, %d), because GPU "
"number on your machine is %d",
dev_id,
platform::GetGPUDeviceCount(),
platform::GetGPUDeviceCount());
std::exit(-1);
}
}
new (&self) platform::CUDAPlace(dev_id);
#else
LOG(ERROR) << string::Sprintf(
"Cannot use GPU because you have installed CPU version "
"PaddlePaddle.\n"
"If you want to use GPU, please try to install GPU version "
"PaddlePaddle by: pip install paddlepaddle-gpu\n"
"If you only have CPU, please change CUDAPlace(%d) to be "
"CPUPlace().\n",
dev_id);
std::exit(-1);
#endif
})
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
.def("get_device_id",
[](const platform::CUDAPlace &self) { return self.GetDeviceId(); })
.def("_type", &PlaceIndex<platform::CUDAPlace>)
// "_equals" has one overload per place type this place can be compared with.
.def("_equals", &IsSamePlace<platform::CUDAPlace, platform::Place>)
.def("_equals", &IsSamePlace<platform::CUDAPlace, platform::CUDAPlace>)
.def("_equals", &IsSamePlace<platform::CUDAPlace, platform::CPUPlace>)
.def("_equals", &IsSamePlace<platform::CUDAPlace, platform::XPUPlace>)
.def("_equals", &IsSamePlace<platform::CUDAPlace, platform::NPUPlace>)
.def("_equals", &IsSamePlace<platform::CUDAPlace, platform::MLUPlace>)
.def("_equals",
&IsSamePlace<platform::CUDAPlace, platform::CUDAPinnedPlace>)
// Same value as "get_device_id" above, with an explicit int return type.
.def("_get_device_id",
[](platform::CUDAPlace &self) -> int { return self.GetDeviceId(); })
#endif
.def("__repr__", string::to_string<const platform::CUDAPlace &>)
.def("__str__", string::to_string<const platform::CUDAPlace &>);
// Python binding for platform::XPUPlace, exposed as "XPUPlace".
// Constructor behaviour depends on the build:
//  - PADDLE_WITH_XPU: dev_id must be in [0, GetXPUDeviceCount()); otherwise
//    an error is logged and the process exits via std::exit(-1).
//  - Other builds: construction always logs an explanatory error and exits.
// "_type", "_equals" and "get_device_id" are registered only in XPU builds;
// "__repr__" and "__str__" are always registered.
py::class_<platform::XPUPlace> xpuplace(m, "XPUPlace", R"DOC(
**Note**:
Examples:
.. code-block:: python
import paddle.fluid as fluid
xpu_place = fluid.XPUPlace(0)
)DOC");
// Remember the Python type object of the freshly registered class.
g_xpuplace_pytype = reinterpret_cast<PyTypeObject *>(xpuplace.ptr());
xpuplace
.def("__init__",
[](platform::XPUPlace &self, int dev_id) {
#ifdef PADDLE_WITH_XPU
// Reject negative ids first, then ids at or beyond the device count.
if (UNLIKELY(dev_id < 0)) {
LOG(ERROR) << string::Sprintf(
"Invalid XPUPlace(%d), device id must be 0 or "
"positive integer",
dev_id);
std::exit(-1);
}
if (UNLIKELY(dev_id >= platform::GetXPUDeviceCount())) {
if (platform::GetXPUDeviceCount() == 0) {
LOG(ERROR) << "Cannot use XPU because there is no XPU "
"detected on your "
"machine.";
std::exit(-1);
} else {
LOG(ERROR) << string::Sprintf(
"Invalid XPUPlace(%d), must inside [0, %d), because XPU "
"number on your machine is %d",
dev_id,
platform::GetXPUDeviceCount(),
platform::GetXPUDeviceCount());
std::exit(-1);
}
}
new (&self) platform::XPUPlace(dev_id);
#else
LOG(ERROR) << string::Sprintf(
"Cannot use XPU because you have installed CPU/GPU version "
"PaddlePaddle.\n"
"If you want to use XPU, please try to install XPU version "
"PaddlePaddle by: pip install paddlepaddle-xpu\n"
"If you only have CPU, please change XPUPlace(%d) to be "
"CPUPlace().\n",
dev_id);
std::exit(-1);
#endif
})
#ifdef PADDLE_WITH_XPU
.def("_type", &PlaceIndex<platform::XPUPlace>)
// "_equals" has one overload per place type this place can be compared with.
.def("_equals", &IsSamePlace<platform::XPUPlace, platform::Place>)
.def("_equals", &IsSamePlace<platform::XPUPlace, platform::CUDAPlace>)
.def("_equals", &IsSamePlace<platform::XPUPlace, platform::CPUPlace>)
.def("_equals", &IsSamePlace<platform::XPUPlace, platform::XPUPlace>)
.def("_equals",
&IsSamePlace<platform::XPUPlace, platform::CUDAPinnedPlace>)
.def("get_device_id",
[](const platform::XPUPlace &self) { return self.GetDeviceId(); })
#endif
.def("__repr__", string::to_string<const platform::XPUPlace &>)
.def("__str__", string::to_string<const platform::XPUPlace &>);
#ifdef PADDLE_WITH_XPU
// Module-level XPU helpers; only compiled into XPU builds.

// XPUVersion enum; export_values() also publishes XPU1/XPU2 on the module.
py::enum_<phi::backends::xpu::XPUVersion>(m, "XPUVersion", py::arithmetic())
    .value("XPU1", phi::backends::xpu::XPUVersion::XPU1)
    .value("XPU2", phi::backends::xpu::XPUVersion::XPU2)
    .export_values();

m.def("get_xpu_device_count", platform::GetXPUDeviceCount);
m.def("get_xpu_device_version", [](int device_id) {
  return platform::get_xpu_version(device_id);
});

// Supported types for one op on a given XPU version. The lookup helper is
// chosen at compile time: the KP variant when PADDLE_WITH_XPU_KP is defined,
// the regular one otherwise.
m.def("get_xpu_device_op_support_types",
      [](const std::string &op_name, phi::backends::xpu::XPUVersion version) {
#ifdef PADDLE_WITH_XPU_KP
        return platform::get_xpu_kp_op_support_type(op_name, version);
#else
        return platform::get_xpu_op_support_type(op_name, version);
#endif
      });

m.def("get_xpu_device_op_list", [](phi::backends::xpu::XPUVersion version) {
  return platform::get_xpu_op_list(version);
});

// float16 is reported as supported when the device's version compares
// greater than XPUVersion::XPU1.
m.def("is_float16_supported", [](const platform::XPUPlace &place) -> bool {
  const auto device_version = platform::get_xpu_version(place.device);
  return device_version > phi::backends::xpu::XPUVersion::XPU1;
});

// bfloat16 uses exactly the same rule as float16 above.
m.def("is_bfloat16_supported", [](const platform::XPUPlace &place) -> bool {
  const auto device_version = platform::get_xpu_version(place.device);
  return device_version > phi::backends::xpu::XPUVersion::XPU1;
});
#endif
// Python binding for platform::CPUPlace, exposed on the module as "CPUPlace".
py::class_<paddle::platform::CPUPlace> cpuplace(m, "CPUPlace", R"DOC(
CPUPlace is a descriptor of a device.
It represents a CPU device on which a tensor will be allocated and a model will run.
Examples:
.. code-block:: python
import paddle
cpu_place = paddle.CPUPlace()
)DOC");
// Keep a handle to the generated Python type object in g_cpuplace_pytype.
g_cpuplace_pytype = reinterpret_cast<PyTypeObject *>(cpuplace.ptr());

// Default constructor only: CPUPlace takes no arguments.
cpuplace.def(py::init<>());
cpuplace.def("_type", &PlaceIndex<platform::CPUPlace>);

// One `_equals` overload per place type that can be compared against.
cpuplace.def("_equals", &IsSamePlace<platform::CPUPlace, platform::Place>);
cpuplace.def("_equals", &IsSamePlace<platform::CPUPlace, platform::XPUPlace>);
cpuplace.def("_equals", &IsSamePlace<platform::CPUPlace, platform::NPUPlace>);
cpuplace.def("_equals",
             &IsSamePlace<platform::CPUPlace, platform::CUDAPlace>);
cpuplace.def("_equals", &IsSamePlace<platform::CPUPlace, platform::CPUPlace>);
cpuplace.def("_equals",
             &IsSamePlace<platform::CPUPlace, platform::CUDAPinnedPlace>);

// repr() and str() share the same formatter.
cpuplace.def("__repr__", string::to_string<const platform::CPUPlace &>);
cpuplace.def("__str__", string::to_string<const platform::CPUPlace &>);
// Python binding for platform::CUDAPinnedPlace, exposed on the module as
// "CUDAPinnedPlace".
py::class_<paddle::platform::CUDAPinnedPlace> cudapinnedplace(
    m, "CUDAPinnedPlace", R"DOC(
CUDAPinnedPlace is a descriptor of a device.
It refers to the page locked memory allocated by the CUDA function `cudaHostAlloc()` in the host memory.
The host operating system will not paging and exchanging the memory.
It can be accessed through direct memory access technology to speed up the copy of data between the host and GPU.
For more information on CUDA data transfer and `pinned memory`,
please refer to `official document <https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#pinned-memory>`_ .
Examples:
.. code-block:: python
import paddle
place = paddle.CUDAPinnedPlace()
)DOC");
// Keep a handle to the generated Python type object in
// g_cudapinnedplace_pytype.
g_cudapinnedplace_pytype =
    reinterpret_cast<PyTypeObject *>(cudapinnedplace.ptr());

// Constructor: when neither PADDLE_WITH_CUDA nor PADDLE_WITH_HIP is defined
// it raises PermissionDenied via PADDLE_THROW; otherwise it constructs the
// place in the storage supplied by pybind11.
cudapinnedplace.def("__init__", [](platform::CUDAPinnedPlace &place) {
#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
  PADDLE_THROW(platform::errors::PermissionDenied(
      "Cannot use CUDAPinnedPlace in CPU only version, "
      "Please recompile or reinstall Paddle with CUDA support."));
#endif
  new (&place) platform::CUDAPinnedPlace();
});
cudapinnedplace.def("_type", &PlaceIndex<platform::CUDAPinnedPlace>);

// One `_equals` overload per place type that can be compared against.
cudapinnedplace.def(
    "_equals", &IsSamePlace<platform::CUDAPinnedPlace, platform::Place>);
cudapinnedplace.def(
    "_equals", &IsSamePlace<platform::CUDAPinnedPlace, platform::CUDAPlace>);
cudapinnedplace.def(
    "_equals", &IsSamePlace<platform::CUDAPinnedPlace, platform::XPUPlace>);
cudapinnedplace.def(
    "_equals", &IsSamePlace<platform::CUDAPinnedPlace, platform::NPUPlace>);
cudapinnedplace.def(
    "_equals", &IsSamePlace<platform::CUDAPinnedPlace, platform::CPUPlace>);
cudapinnedplace.def(
    "_equals",
    &IsSamePlace<platform::CUDAPinnedPlace, platform::CUDAPinnedPlace>);

// repr() and str() share the same formatter.
cudapinnedplace.def("__repr__",
                    string::to_string<const platform::CUDAPinnedPlace &>);
cudapinnedplace.def("__str__",
                    string::to_string<const platform::CUDAPinnedPlace &>);
// NPUPlace
// Python binding for platform::NPUPlace, exposed on the module as "NPUPlace".
py::class_<platform::NPUPlace> npuplace(m, "NPUPlace", R"DOC(
NPUPlace is a descriptor of a device.
It represents a NPU device on which a tensor will be allocated and a model will run.
Examples:
.. code-block:: python
import paddle
npu_place = paddle.NPUPlace(0)
)DOC");
// Keep a handle to the generated Python type object in g_npuplace_pytype.
g_npuplace_pytype = reinterpret_cast<PyTypeObject *>(npuplace.ptr());
npuplace
    // Constructor: NPUPlace(dev_id). Every failure path logs an error and
    // calls std::exit(-1), i.e. it ends the process instead of raising a
    // Python exception.
    .def("__init__",
         [](platform::NPUPlace &self, int dev_id) {
#ifdef PADDLE_WITH_ASCEND_CL
           // Negative ids are never valid.
           if (UNLIKELY(dev_id < 0)) {
             LOG(ERROR) << string::Sprintf(
                 "Invalid NPUPlace(%d), device id must be 0 or "
                 "positive integer",
                 dev_id);
             std::exit(-1);
           }
           // The id must be smaller than the number of detected devices.
           if (UNLIKELY(dev_id >= platform::GetNPUDeviceCount())) {
             if (platform::GetNPUDeviceCount() == 0) {
               LOG(ERROR) << "Cannot use NPU because there is no NPU "
                             "detected on your "
                             "machine.";
               std::exit(-1);
             } else {
               LOG(ERROR) << string::Sprintf(
                   "Invalid NPUPlace(%d), must inside [0, %d), because NPU "
                   "number on your machine is %d",
                   dev_id,
                   platform::GetNPUDeviceCount(),
                   platform::GetNPUDeviceCount());
               std::exit(-1);
             }
           }
           // Construct the place in the storage pybind11 handed to us.
           new (&self) platform::NPUPlace(dev_id);
#else
           // Built without Ascend support: the class exists but cannot be
           // instantiated.
           LOG(ERROR) << string::Sprintf(
               "Cannot use NPU because you have installed CPU/GPU version "
               "PaddlePaddle.\n"
               "If you want to use NPU, please try to install NPU version "
               "PaddlePaddle by: pip install paddlepaddle-npu\n"
               "If you only have CPU, please change NPUPlace(%d) to be "
               "CPUPlace().\n",
               dev_id);
           std::exit(-1);
#endif
         })
    .def("_type", &PlaceIndex<platform::NPUPlace>)
    // One `_equals` overload per place type that can be compared against.
    .def("_equals", &IsSamePlace<platform::NPUPlace, platform::Place>)
    .def("_equals", &IsSamePlace<platform::NPUPlace, platform::CUDAPlace>)
    .def("_equals", &IsSamePlace<platform::NPUPlace, platform::CPUPlace>)
    .def("_equals", &IsSamePlace<platform::NPUPlace, platform::XPUPlace>)
    .def("_equals", &IsSamePlace<platform::NPUPlace, platform::NPUPlace>)
    .def("_equals",
         &IsSamePlace<platform::NPUPlace, platform::CUDAPinnedPlace>)
    .def("get_device_id",
         [](const platform::NPUPlace &self) { return self.GetDeviceId(); })
    // Bind __repr__ alongside __str__ (same formatter), matching the XPUPlace,
    // CPUPlace, CUDAPinnedPlace and Place bindings, so repr() of an NPUPlace
    // is readable instead of the default "<... object at 0x...>".
    .def("__repr__", string::to_string<const platform::NPUPlace &>)
    .def("__str__", string::to_string<const platform::NPUPlace &>);
// IPUPlace
// Python binding for platform::IPUPlace, exposed on the module as "IPUPlace".
py::class_<platform::IPUPlace>(m, "IPUPlace", R"DOC(
IPUPlace is a descriptor of a device.
It represents a IPU device on which a tensor will be allocated and a model will run.
Examples:
.. code-block:: python
import paddle
# required: ipu
ipu_place = paddle.IPUPlace()
)DOC")
    // Constructor: IPUPlace() takes no device id. Every failure path logs an
    // error and calls std::exit(-1), i.e. it ends the process instead of
    // raising a Python exception.
    .def("__init__",
         [](platform::IPUPlace &self) {
#ifdef PADDLE_WITH_IPU
           if (platform::GetIPUDeviceCount() == 0) {
             LOG(ERROR) << "Cannot use IPU because there is no IPU "
                           "detected on your "
                           "machine.";
             std::exit(-1);
           }
           // Use ipu(0) to compile, while running with the number the user
           // configures in sharding and pipeline.
           new (&self) platform::IPUPlace(0);
#else
           // Built without IPU support: the class exists but cannot be
           // instantiated.
           LOG(ERROR) << string::Sprintf(
               "Cannot use IPU because you didn't install IPU version "
               "PaddlePaddle.\n"
               "If you want to use IPU, please try to install IPU version "
               "PaddlePaddle by: pip install paddlepaddle*\n"
               "If you only have CPU, please change IPUPlace to be "
               "CPUPlace().\n");
           std::exit(-1);
#endif
         })
    .def("_type", &PlaceIndex<platform::IPUPlace>)
    // One `_equals` overload per place type that can be compared against.
    .def("_equals", &IsSamePlace<platform::IPUPlace, platform::Place>)
    .def("_equals", &IsSamePlace<platform::IPUPlace, platform::CUDAPlace>)
    .def("_equals", &IsSamePlace<platform::IPUPlace, platform::CPUPlace>)
    .def("_equals", &IsSamePlace<platform::IPUPlace, platform::XPUPlace>)
    .def("_equals", &IsSamePlace<platform::IPUPlace, platform::NPUPlace>)
    .def("_equals", &IsSamePlace<platform::IPUPlace, platform::IPUPlace>)
    .def("_equals",
         &IsSamePlace<platform::IPUPlace, platform::CUDAPinnedPlace>)
#ifdef PADDLE_WITH_IPU
    // The device id accessor is only bound in IPU builds.
    .def("get_device_id",
         [](const platform::IPUPlace &self) { return self.GetDeviceId(); })
#endif
    // Bind __repr__ alongside __str__ (same formatter), matching the XPUPlace,
    // CPUPlace, CUDAPinnedPlace and Place bindings, so repr() of an IPUPlace
    // is readable instead of the default "<... object at 0x...>".
    .def("__repr__", string::to_string<const platform::IPUPlace &>)
    .def("__str__", string::to_string<const platform::IPUPlace &>);
// MLUPlace
// Python binding for platform::MLUPlace, exposed on the module as "MLUPlace".
py::class_<platform::MLUPlace> mluplace(m, "MLUPlace", R"DOC(
MLUPlace is a descriptor of a device.
It represents a MLU device on which a tensor will be allocated and a model will run.
Examples:
.. code-block:: python
import paddle
# required: mlu
mlu_place = paddle.MLUPlace(0)
)DOC");
// Keep a handle to the generated Python type object in g_mluplace_pytype.
g_mluplace_pytype = reinterpret_cast<PyTypeObject *>(mluplace.ptr());
mluplace
    // Constructor: MLUPlace(dev_id). Every failure path logs an error and
    // calls std::exit(-1), i.e. it ends the process instead of raising a
    // Python exception.
    .def("__init__",
         [](platform::MLUPlace &self, int dev_id) {
#ifdef PADDLE_WITH_MLU
           // Negative ids are never valid.
           if (UNLIKELY(dev_id < 0)) {
             LOG(ERROR) << string::Sprintf(
                 "Invalid MLUPlace(%d), device id must be 0 or "
                 "positive integer",
                 dev_id);
             std::exit(-1);
           }
           // The id must be smaller than the number of detected devices.
           if (UNLIKELY(dev_id >= platform::GetMLUDeviceCount())) {
             if (platform::GetMLUDeviceCount() == 0) {
               LOG(ERROR) << "Cannot use MLU because there is no MLU "
                             "detected on your "
                             "machine.";
               std::exit(-1);
             } else {
               LOG(ERROR) << string::Sprintf(
                   "Invalid MLUPlace(%d), must inside [0, %d), because MLU "
                   "number on your machine is %d",
                   dev_id,
                   platform::GetMLUDeviceCount(),
                   platform::GetMLUDeviceCount());
               std::exit(-1);
             }
           }
           // Construct the place in the storage pybind11 handed to us.
           new (&self) platform::MLUPlace(dev_id);
#else
           // Built without MLU support: the class exists but cannot be
           // instantiated.
           LOG(ERROR) << string::Sprintf(
               "Cannot use MLU because you have installed CPU/GPU/... "
               "version "
               "PaddlePaddle.\n"
               "If you want to use MLU, please try to install MLU version "
               "PaddlePaddle by: pip install paddlepaddle-mlu\n"
               "If you only have CPU, please change MLUPlace(%d) to be "
               "CPUPlace().\n",
               dev_id);
           std::exit(-1);
#endif
         })
    .def("_type", &PlaceIndex<platform::MLUPlace>)
#ifdef PADDLE_WITH_MLU
    // Equality overloads and the device id accessor are only bound in MLU
    // builds.
    .def("_equals", &IsSamePlace<platform::MLUPlace, platform::Place>)
    .def("_equals", &IsSamePlace<platform::MLUPlace, platform::CUDAPlace>)
    .def("_equals", &IsSamePlace<platform::MLUPlace, platform::CPUPlace>)
    .def("_equals", &IsSamePlace<platform::MLUPlace, platform::XPUPlace>)
    .def("_equals", &IsSamePlace<platform::MLUPlace, platform::NPUPlace>)
    .def("_equals", &IsSamePlace<platform::MLUPlace, platform::IPUPlace>)
    .def("_equals", &IsSamePlace<platform::MLUPlace, platform::MLUPlace>)
    .def("_equals",
         &IsSamePlace<platform::MLUPlace, platform::CUDAPinnedPlace>)
    .def("get_device_id",
         [](const platform::MLUPlace &self) { return self.GetDeviceId(); })
#endif
    // Bind __repr__ alongside __str__ (same formatter), matching the XPUPlace,
    // CPUPlace, CUDAPinnedPlace and Place bindings, so repr() of an MLUPlace
    // is readable instead of the default "<... object at 0x...>".
    .def("__repr__", string::to_string<const platform::MLUPlace &>)
    .def("__str__", string::to_string<const platform::MLUPlace &>);
// Python binding for the generic platform::Place, exposed on the module as
// "Place".
py::class_<platform::Place> platformplace(m, "Place");
// Keep a handle to the generated Python type object in g_place_pytype.
g_place_pytype = reinterpret_cast<PyTypeObject *>(platformplace.ptr());

platformplace.def(py::init<>());
platformplace.def("_type", &PlaceIndex<platform::Place>);

// One `_equals` overload per place type that can be compared against.
platformplace.def("_equals", &IsSamePlace<platform::Place, platform::Place>);
platformplace.def("_equals",
                  &IsSamePlace<platform::Place, platform::CUDAPlace>);
platformplace.def("_equals",
                  &IsSamePlace<platform::Place, platform::CPUPlace>);
platformplace.def("_equals",
                  &IsSamePlace<platform::Place, platform::XPUPlace>);
platformplace.def("_equals",
                  &IsSamePlace<platform::Place, platform::NPUPlace>);
platformplace.def("_equals",
                  &IsSamePlace<platform::Place, platform::IPUPlace>);
platformplace.def("_equals",
                  &IsSamePlace<platform::Place, platform::CUDAPinnedPlace>);
platformplace.def("_equals",
                  &IsSamePlace<platform::Place, platform::MLUPlace>);
platformplace.def("_equals",
                  &IsSamePlace<platform::Place, platform::CustomPlace>);

// Kind predicates: each one forwards to the matching platform::is_*_place().
platformplace.def("is_gpu_place", [](platform::Place &place) {
  return platform::is_gpu_place(place);
});
platformplace.def("is_cpu_place", [](platform::Place &place) {
  return platform::is_cpu_place(place);
});
platformplace.def("is_xpu_place", [](platform::Place &place) {
  return platform::is_xpu_place(place);
});
platformplace.def("is_npu_place", [](platform::Place &place) {
  return platform::is_npu_place(place);
});
platformplace.def("is_ipu_place", [](platform::Place &place) {
  return platform::is_ipu_place(place);
});
platformplace.def("is_cuda_pinned_place", [](platform::Place &place) {
  return platform::is_cuda_pinned_place(place);
});
platformplace.def("is_mlu_place", [](platform::Place &place) {
  return platform::is_mlu_place(place);
});
platformplace.def("is_custom_place", [](platform::Place &place) {
  return platform::is_custom_place(place);
});

// Device id accessors: all of them return the same `device` field; none of
// them checks which kind of place this actually is.
platformplace.def("gpu_device_id",
                  [](platform::Place &place) { return place.device; });
platformplace.def("xpu_device_id",
                  [](platform::Place &place) { return place.device; });
platformplace.def("npu_device_id",
                  [](platform::Place &place) { return place.device; });
platformplace.def("ipu_device_id",
                  [](platform::Place &place) { return place.device; });
platformplace.def("mlu_device_id",
                  [](platform::Place &place) { return place.device; });
platformplace.def("custom_device_id",
                  [](platform::Place &place) { return place.device; });

// set_place overloads: assign from a generic Place or from any concrete
// place type.
platformplace.def("set_place",
                  [](platform::Place &target, const platform::Place &source) {
                    target = source;
                  });
platformplace.def(
    "set_place",
    [](platform::Place &target, const platform::CPUPlace &source) {
      target = source;
    });
platformplace.def(
    "set_place",
    [](platform::Place &target, const platform::XPUPlace &source) {
      target = source;
    });
platformplace.def(
    "set_place",
    [](platform::Place &target, const platform::CUDAPlace &source) {
      target = source;
    });
platformplace.def(
    "set_place",
    [](platform::Place &target, const platform::CUDAPinnedPlace &source) {
      target = source;
    });
platformplace.def(
    "set_place",
    [](platform::Place &target, const platform::NPUPlace &source) {
      target = source;
    });
platformplace.def(
    "set_place",
    [](platform::Place &target, const platform::IPUPlace &source) {
      target = source;
    });
platformplace.def(
    "set_place",
    [](platform::Place &target, const platform::MLUPlace &source) {
      target = source;
    });
platformplace.def(
    "set_place",
    [](platform::Place &target, const platform::CustomPlace &source) {
      target = source;
    });

// repr() and str() share the same formatter.
platformplace.def("__repr__", string::to_string<const platform::Place &>);
platformplace.def("__str__", string::to_string<const platform::Place &>);
py::class_<OperatorBase>(m, "Operator")
.def_static("create",
......@@ -3661,927 +2144,6 @@ All parameter, weight, gradient are variables in Paddle.
// clear_executor_cache(): calls Finalize() on the ExecutorInfoCache singleton.
m.def("clear_executor_cache", [] {
  framework::ExecutorInfoCache::Instance().Finalize();
});
// Attribute type accepted by the last Pass.set() overload below: a map from
// string to (bool, LoDTensor) pairs.
using VarQuantScale =
    std::unordered_map<std::string, std::pair<bool, LoDTensor>>;

// Python binding for ir::Pass (held by std::shared_ptr), exposed as "Pass".
py::class_<ir::Pass, std::shared_ptr<ir::Pass>> pass(m, "Pass");
pass.def(py::init());
pass.def("has", &ir::Pass::Has);

// set_not_owned: hands the pass a pointer to a caller-owned ProgramDesc.
pass.def("set_not_owned",
         [](ir::Pass &target,
            const std::string &attr_name,
            ProgramDesc &program) {
           target.SetNotOwned<ProgramDesc>(attr_name, &program);
         });

// set overloads: each one heap-allocates a copy of the Python-supplied value
// and passes the raw pointer to ir::Pass::Set.
pass.def("set",
         [](ir::Pass &target,
            const std::string &name,
            const std::string &value) {
           target.Set<std::string>(name, new std::string(value));
         });
pass.def("set", [](ir::Pass &target, const std::string &name, bool value) {
  target.Set<bool>(name, new bool(value));
});
pass.def("set", [](ir::Pass &target, const std::string &name, int value) {
  target.Set<const int>(name, new int(value));
});
pass.def("set",
         [](ir::Pass &target,
            const std::string &name,
            std::vector<std::string> values) {
           target.Set(name, new std::vector<std::string>(values));
         });
pass.def("set",
         [](ir::Pass &target,
            const std::string &name,
            std::unordered_set<std::string> values) {
           target.Set(name, new std::unordered_set<std::string>(values));
         });
pass.def("set",
         [](ir::Pass &target,
            const std::string &name,
            std::unordered_set<int> values) {
           target.Set(name, new std::unordered_set<int>(values));
         });
pass.def("set",
         [](ir::Pass &target, const std::string &name, VarQuantScale scales) {
           target.Set(name, new VarQuantScale(scales));
         });

pass.def("type", &ir::Pass::Type);

// apply: runs the pass on the given graph; the result of Apply() is not
// returned to Python.
pass.def("apply", [](ir::Pass &target, std::shared_ptr<ir::Graph> graph) {
  target.Apply(graph.get());
});
// Python binding for ir::PassBuilder (held by std::shared_ptr), exposed as
// "PassBuilder".
py::class_<ir::PassBuilder, std::shared_ptr<ir::PassBuilder>> pb(
    m, "PassBuilder");
pb.def(py::init());

// append_pass(pass_type) -> Pass: forwards to PassBuilder::AppendPass.
pb.def("append_pass",
       [](ir::PassBuilder &builder,
          const std::string &pass_type) -> std::shared_ptr<ir::Pass> {
         return builder.AppendPass(pass_type);
       });

// all_passes(): forwards to PassBuilder::AllPasses.
pb.def("all_passes",
       [](ir::PassBuilder &builder) { return builder.AllPasses(); });

// insert_pass(idx, pass_type): forwards to PassBuilder::InsertPass and
// returns its result.
pb.def("insert_pass",
       [](ir::PassBuilder &builder,
          size_t position,
          const std::string &pass_type) {
         return builder.InsertPass(position, pass_type);
       });

// remove_pass(idx): forwards to PassBuilder::RemovePass; returns nothing.
pb.def("remove_pass", [](ir::PassBuilder &builder, size_t position) {
  builder.RemovePass(position);
});
// -- Python bindings for ParallelExecutor.
// `pe` is the ParallelExecutor class object. It is also used as the enclosing
// scope for the ExecutionStrategy class declared right below, so that class is
// registered as an attribute of ParallelExecutor rather than of the module.
py::class_<ParallelExecutor> pe(m, "ParallelExecutor");
// ExecutionStrategy class object with its user-facing docstring; its
// constructor and properties are attached to `exec_strategy` further down.
py::class_<ExecutionStrategy> exec_strategy(pe, "ExecutionStrategy", R"DOC(
ExecutionStrategy allows the user to more preciously control how to run
the program in ParallelExecutor by setting the property.
Returns:
ExecutionStrategy: An ExecutionStrategy object.
Examples:
.. code-block:: python
import paddle
import paddle.static as static
import paddle.nn.functional as F
paddle.enable_static()
x = static.data(name='x', shape=[None, 13], dtype='float32')
y = static.data(name='y', shape=[None, 1], dtype='float32')
y_predict = static.nn.fc(input=x, size=1, act=None)
cost = F.square_error_cost(input=y_predict, label=y)
avg_loss = paddle.mean(cost)
sgd_optimizer = paddle.optimizer.SGD(learning_rate=0.001)
sgd_optimizer.minimize(avg_loss)
exec_strategy = static.ExecutionStrategy()
exec_strategy.num_threads = 4
train_exe = static.ParallelExecutor(use_cuda=False,
loss_name=avg_loss.name,
exec_strategy=exec_strategy)
)DOC");
py::enum_<paddle::platform::DeviceType>(m, "DeviceType", py::arithmetic())
.value("CPU", paddle::platform::DeviceType::CPU)
.value("CUDA", paddle::platform::DeviceType::CUDA)
.value("XPU", paddle::platform::DeviceType::XPU);
// ExecutionStrategy is default-constructible from Python. Each property below
// is a getter/setter pair that reads or writes one ExecutionStrategy member.
exec_strategy.def(py::init());

exec_strategy.def_property(
    "num_threads",
    [](const ExecutionStrategy &strategy) { return strategy.num_threads_; },
    [](ExecutionStrategy &strategy, size_t value) {
      strategy.num_threads_ = value;
    },
    R"DOC(
The type is INT, num_threads represents the size of thread pool that
used to run the operators of the current program in ParallelExecutor.
If :math:`num\_threads=1`, all the operators will execute one by one,
but the order maybe difference between iterations.
If it is not set, it will be set in ParallelExecutor according to the
device type and device count, for GPU, :math:`num\_threads=device\_count*4`, for CPU,
:math:`num\_threads=CPU\_NUM*4`, the explanation of:math:`CPU\_NUM` is in ParallelExecutor.
if it is not set, ParallelExecutor will get the cpu count by calling
`multiprocessing.cpu_count()`. Default 0.
Examples:
.. code-block:: python
import paddle
import paddle.static as static
paddle.enable_static()
exec_strategy = static.ExecutionStrategy()
exec_strategy.num_threads = 4
)DOC");

// NOTE(liuyuhui): no docstring for '_use_device' on purpose, because
// use_device isn't exposed to users.
exec_strategy.def_property(
    "_use_device",
    [](const ExecutionStrategy &strategy) { return strategy.use_device_; },
    [](ExecutionStrategy &strategy, paddle::platform::DeviceType value) {
      strategy.use_device_ = value;
    });

exec_strategy.def_property(
    "allow_op_delay",
    [](const ExecutionStrategy &strategy) { return strategy.allow_op_delay_; },
    [](ExecutionStrategy &strategy, bool value) {
      strategy.allow_op_delay_ = value;
    },
    R"DOC(The type is BOOL, allow_op_delay represents whether to delay the
communication operators to run, it may make the execution faster.
Note that this option is invalid now, and it will be removed in
next version. Default False.)DOC");

exec_strategy.def_property(
    "num_iteration_per_drop_scope",
    [](const ExecutionStrategy &strategy) {
      return strategy.num_iteration_per_drop_scope_;
    },
    [](ExecutionStrategy &strategy, size_t value) {
      strategy.num_iteration_per_drop_scope_ = value;
    },
    R"DOC(The type is INT, num_iteration_per_drop_scope indicates how
many iterations to clean up the temp variables which
is generated during execution. It may make the execution faster,
because the temp variable's shape maybe the same between two iterations.
Default 100.
.. note::
1. If you fetch data when calling the 'run', the ParallelExecutor
will clean up the temp variables at the end of the current iteration.
2. In some NLP model, it may cause the GPU memory is insufficient,
in this case, you should reduce `num_iteration_per_drop_scope`.
Examples:
.. code-block:: python
import paddle
import paddle.static as static
paddle.enable_static()
exec_strategy = static.ExecutionStrategy()
exec_strategy.num_iteration_per_drop_scope = 10
)DOC");

exec_strategy.def_property(
    "num_iteration_per_run",
    [](const ExecutionStrategy &strategy) {
      return strategy.num_iteration_per_run_;
    },
    [](ExecutionStrategy &strategy, size_t value) {
      strategy.num_iteration_per_run_ = value;
    },
    R"DOC(This config that how many iteration the executor will run when
user call exe.run() in python。Default: 1.
Examples:
.. code-block:: python
import paddle
import paddle.static as static
paddle.enable_static()
exec_strategy = static.ExecutionStrategy()
exec_strategy.num_iteration_per_run = 10
)DOC");

// Python name "use_thread_barrier" maps to the member thread_barrier_.
exec_strategy.def_property(
    "use_thread_barrier",
    [](const ExecutionStrategy &strategy) { return strategy.thread_barrier_; },
    [](ExecutionStrategy &strategy, bool value) {
      strategy.thread_barrier_ = value;
    },
    R"DOC(This config that the this is distributed training with parameter server
)DOC");

// Underscore-prefixed property bound without a docstring.
exec_strategy.def_property(
    "_dry_run",
    [](const ExecutionStrategy &strategy) { return strategy.dry_run_; },
    [](ExecutionStrategy &strategy, bool value) {
      strategy.dry_run_ = value;
    });
// Boolean view of ExecutionStrategy::type_: True means kExperimental.
// Writing True selects kExperimental, writing False selects kDefault.
exec_strategy.def_property(
    "use_experimental_executor",
    [](const ExecutionStrategy &strategy) {
      return ExecutionStrategy::kExperimental == strategy.type_;
    },
    [](ExecutionStrategy &strategy, bool enable) {
      if (enable) {
        strategy.type_ = ExecutionStrategy::kExperimental;
      } else {
        strategy.type_ = ExecutionStrategy::kDefault;
      }
    });
// BuildStrategy class object with its user-facing docstring. It is registered
// in the scope of `pe`, i.e. as an attribute of the ParallelExecutor class
// rather than of the module. Its nested enums, constructor and properties are
// attached to `build_strategy` further down.
py::class_<BuildStrategy> build_strategy(pe, "BuildStrategy", R"DOC(
BuildStrategy allows the user to more preciously control how to
build the SSA Graph in ParallelExecutor by setting the property.
Returns:
BuildStrategy: An BuildStrategy object.
Examples:
.. code-block:: python
import os
import paddle
import paddle.static as static
paddle.enable_static()
os.environ['CPU_NUM'] = str(2)
places = static.cpu_places()
data = static.data(name="x", shape=[None, 1], dtype="float32")
hidden = static.nn.fc(input=data, size=10)
loss = paddle.mean(hidden)
paddle.optimizer.SGD(learning_rate=0.01).minimize(loss)
build_strategy = static.BuildStrategy()
build_strategy.enable_inplace = True
build_strategy.memory_optimize = True
build_strategy.reduce_strategy = static.BuildStrategy.ReduceStrategy.Reduce
program = static.CompiledProgram(static.default_main_program())
program = program.with_data_parallel(loss_name=loss.name,
build_strategy=build_strategy,
places=places)
)DOC");
py::enum_<BuildStrategy::ReduceStrategy>(build_strategy, "ReduceStrategy")
.value("Reduce", BuildStrategy::ReduceStrategy::kReduce)
.value("AllReduce", BuildStrategy::ReduceStrategy::kAllReduce)
.value("_NoReduce", BuildStrategy::ReduceStrategy::kNoReduce);
py::enum_<BuildStrategy::GradientScaleStrategy>(build_strategy,
"GradientScaleStrategy")
.value("CoeffNumDevice",
BuildStrategy::GradientScaleStrategy::kCoeffNumDevice)
.value("One", BuildStrategy::GradientScaleStrategy::kOne)
.value("Customized", BuildStrategy::GradientScaleStrategy::kCustomized);
build_strategy.def(py::init())
.def("_clear_finalized", &BuildStrategy::ClearFinalized)
.def_property(
"reduce_strategy",
[](const BuildStrategy &self) { return self.reduce_; },
[](BuildStrategy &self, BuildStrategy::ReduceStrategy strategy) {
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be "
"configured again."));
self.reduce_ = strategy;
},
R"DOC((fluid.BuildStrategy.ReduceStrategy, optional): there are two reduce
strategies in ParallelExecutor, AllReduce and Reduce. If you want
that all the parameters' optimization are done on all devices independently,
you should choose AllReduce; otherwise, if you choose Reduce, all the parameters'
optimization will be evenly distributed to different devices, and then
broadcast the optimized parameter to other devices.
Default is 'AllReduce'.
Examples:
.. code-block:: python
import paddle
import paddle.static as static
paddle.enable_static()
build_strategy = static.BuildStrategy()
build_strategy.reduce_strategy = static.BuildStrategy.ReduceStrategy.Reduce
)DOC")
.def_property(
"gradient_scale_strategy",
[](const BuildStrategy &self) { return self.gradient_scale_; },
[](BuildStrategy &self,
BuildStrategy::GradientScaleStrategy strategy) {
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be "
"configured again."));
self.gradient_scale_ = strategy;
},
R"DOC((paddle.static.BuildStrategy.GradientScaleStrategy, optional): there are three
ways of defining :math:`loss@grad` in ParallelExecutor, that is, CoeffNumDevice,
One and Customized. By default, ParallelExecutor sets the :math:`loss@grad`
according to the number of devices. If you want to customize :math:`loss@grad`,
you can choose Customized. Default is 'CoeffNumDevice'.
Examples:
.. code-block:: python
import numpy
import os
import paddle
import paddle.static as static
paddle.enable_static()
use_cuda = True
place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
exe = static.Executor(place)
# NOTE: If you use CPU to run the program, you need
# to specify the CPU_NUM, otherwise, paddle will use
# all the number of the logic core as the CPU_NUM,
# in that case, the batch size of the input should be
# greater than CPU_NUM, if not, the process will be
# failed by an exception.
if not use_cuda:
os.environ['CPU_NUM'] = str(2)
places = static.cpu_places()
else:
places = static.cuda_places()
data = static.data(name='X', shape=[None, 1], dtype='float32')
hidden = static.nn.fc(input=data, size=10)
loss = paddle.mean(hidden)
paddle.optimizer.SGD(learning_rate=0.01).minimize(loss)
exe.run(static.default_startup_program())
build_strategy = static.BuildStrategy()
build_strategy.gradient_scale_strategy = \
static.BuildStrategy.GradientScaleStrategy.Customized
compiled_prog = static.CompiledProgram(
static.default_main_program()).with_data_parallel(
loss_name=loss.name, build_strategy=build_strategy,
places=places)
dev_count = len(places)
x = numpy.random.random(size=(10, 1)).astype('float32')
loss_grad = numpy.ones((dev_count)).astype("float32") * 0.01
loss_grad_name = loss.name+"@GRAD"
loss_data = exe.run(compiled_prog,
feed={"X": x, loss_grad_name : loss_grad},
fetch_list=[loss.name, loss_grad_name])
)DOC")
.def_property(
"debug_graphviz_path",
[](const BuildStrategy &self) { return self.debug_graphviz_path_; },
[](BuildStrategy &self, const std::string &path) {
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be "
"configured again."));
self.debug_graphviz_path_ = path;
},
R"DOC((str, optional): debug_graphviz_path indicates the path that
writing the SSA Graph to file in the form of graphviz.
It is useful for debugging. Default is empty string, that is, ""
Examples:
.. code-block:: python
import paddle
import paddle.static as static
paddle.enable_static()
build_strategy = static.BuildStrategy()
build_strategy.debug_graphviz_path = "./graph"
)DOC")
.def_property(
"enable_sequential_execution",
[](const BuildStrategy &self) {
return self.enable_sequential_execution_;
},
[](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be "
"configured again."));
self.enable_sequential_execution_ = b;
},
R"DOC((bool, optional): If set True, the execution order of ops would
be the same as what is in the program. Default is False.
Examples:
.. code-block:: python
import paddle
import paddle.static as static
paddle.enable_static()
build_strategy = static.BuildStrategy()
build_strategy.enable_sequential_execution = True
)DOC")
.def_property(
"remove_unnecessary_lock",
[](const BuildStrategy &self) {
return self.remove_unnecessary_lock_;
},
[](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be "
"configured again."));
self.remove_unnecessary_lock_ = b;
},
R"DOC((bool, optional): If set True, some locks in GPU ops would be
released and ParallelExecutor would run faster. Default is True.
Examples:
.. code-block:: python
import paddle
import paddle.static as static
paddle.enable_static()
build_strategy = static.BuildStrategy()
build_strategy.remove_unnecessary_lock = True
)DOC")
.def_property(
"num_trainers",
[](const BuildStrategy &self) { return self.num_trainers_; },
[](BuildStrategy &self, int num_trainers) {
#ifdef WIN32
PADDLE_THROW(platform::errors::Unavailable(
"Distribution mode is not supported on Windows platform."));
#endif
self.num_trainers_ = num_trainers;
})
.def_property(
"trainers_endpoints",
[](const BuildStrategy &self) { return self.trainers_endpoints_; },
[](BuildStrategy &self,
const std::vector<std::string> &trainers_endpoints) {
self.trainers_endpoints_ = trainers_endpoints;
})
.def_property(
"trainer_id",
[](const BuildStrategy &self) { return self.trainer_id_; },
[](BuildStrategy &self, int trainer_id) {
self.trainer_id_ = trainer_id;
})
.def_property(
"nccl_comm_num",
[](const BuildStrategy &self) { return self.nccl_comm_num_; },
[](BuildStrategy &self, int nccl_comm_num) {
self.nccl_comm_num_ = nccl_comm_num;
})
.def_property(
"bkcl_comm_num",
[](const BuildStrategy &self) { return self.bkcl_comm_num_; },
[](BuildStrategy &self, int bkcl_comm_num) {
self.bkcl_comm_num_ = bkcl_comm_num;
})
.def_property(
"use_hierarchical_allreduce",
[](const BuildStrategy &self) {
return self.use_hierarchical_allreduce_;
},
[](BuildStrategy &self, bool use) {
self.use_hierarchical_allreduce_ = use;
})
.def_property(
"hierarchical_allreduce_inter_nranks",
[](const BuildStrategy &self) {
return self.hierarchical_allreduce_inter_nranks_;
},
[](BuildStrategy &self, int nranks) {
self.hierarchical_allreduce_inter_nranks_ = nranks;
})
.def_property(
"fuse_elewise_add_act_ops",
[](const BuildStrategy &self) {
return self.fuse_elewise_add_act_ops_;
},
[](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be "
"configured again."));
self.fuse_elewise_add_act_ops_ = b;
},
R"DOC((bool, optional): fuse_elewise_add_act_ops indicate whether
to fuse elementwise_add_op and activation_op,
it may make the execution faster. Default is False.
Examples:
.. code-block:: python
import paddle
import paddle.static as static
paddle.enable_static()
build_strategy = static.BuildStrategy()
build_strategy.fuse_elewise_add_act_ops = True
)DOC")
.def_property(
"fuse_gemm_epilogue",
[](const BuildStrategy &self) { return self.fuse_gemm_epilogue_; },
[](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be "
"configured again."));
self.fuse_gemm_epilogue_ = b;
},
R"DOC((bool, optional): fuse_gemm_epilogue indicate whether
to fuse matmul_op, elemenewist_add_op and activation_op,
it may make the execution faster. Default is False.
Examples:
.. code-block:: python
import paddle
import paddle.static as static
paddle.enable_static()
build_strategy = static.BuildStrategy()
build_strategy.fuse_gemm_epilogue = True
)DOC")
.def_property(
"fuse_bn_act_ops",
[](const BuildStrategy &self) { return self.fuse_bn_act_ops_; },
[](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be "
"configured again."));
self.fuse_bn_act_ops_ = b;
},
R"DOC((bool, optional): fuse_bn_act_ops indicate whether
to fuse batch_norm and activation_op,
it may make the execution faster. Default is False.
Examples:
.. code-block:: python
import paddle
import paddle.static as static
paddle.enable_static()
build_strategy = static.BuildStrategy()
build_strategy.fuse_bn_act_ops = True
)DOC")
.def_property(
"fuse_bn_add_act_ops",
[](const BuildStrategy &self) { return self.fuse_bn_add_act_ops_; },
[](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be "
"configured again."));
self.fuse_bn_add_act_ops_ = b;
},
R"DOC((bool, optional): fuse_bn_add_act_ops indicate whether
to fuse batch_norm, elementwise_add and activation_op,
it may make the execution faster. Default is True
Examples:
.. code-block:: python
import paddle
import paddle.static as static
paddle.enable_static()
build_strategy = static.BuildStrategy()
build_strategy.fuse_bn_add_act_ops = True
)DOC")
.def_property(
"enable_auto_fusion",
[](const BuildStrategy &self) { return self.enable_auto_fusion_; },
[](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be "
"configured again."));
self.enable_auto_fusion_ = b;
},
R"DOC((bool, optional): Whether to enable fusing subgraph to a
fusion_group. Now we only support fusing subgraph that composed
of elementwise-like operators, such as elementwise_add/mul
without broadcast and activations.
Examples:
.. code-block:: python
import paddle
import paddle.static as static
paddle.enable_static()
build_strategy = static.BuildStrategy()
build_strategy.enable_auto_fusion = True
)DOC")
.def_property(
"fuse_relu_depthwise_conv",
[](const BuildStrategy &self) {
return self.fuse_relu_depthwise_conv_;
},
[](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be "
"configured again."));
self.fuse_relu_depthwise_conv_ = b;
},
R"DOC((bool, optional): fuse_relu_depthwise_conv indicate whether
to fuse relu and depthwise_conv2d,
it will save GPU memory and may make the execution faster.
This options is only available in GPU devices.
Default is False.
Examples:
.. code-block:: python
import paddle
import paddle.static as static
paddle.enable_static()
build_strategy = static.BuildStrategy()
build_strategy.fuse_relu_depthwise_conv = True
)DOC")
.def_property(
"fuse_broadcast_ops",
[](const BuildStrategy &self) {
return self.fuse_broadcast_ops_ == true ||
self.fuse_broadcast_ops_ == paddle::none;
},
[](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, "
"cannot be configured again."));
self.fuse_broadcast_ops_ = b;
},
R"DOC((bool, optional): fuse_broadcast_op indicates whether
to fuse the broadcast ops. Note that, in Reduce mode,
fusing broadcast ops may make the program faster. Because
fusing broadcast OP equals delaying the execution of all
broadcast Ops, in this case, all nccl streams are used only
for NCCLReduce operations for a period of time. Default False.
Examples:
.. code-block:: python
import paddle
import paddle.static as static
paddle.enable_static()
build_strategy = static.BuildStrategy()
build_strategy.fuse_broadcast_ops = True
)DOC")
.def_property(
"fuse_all_optimizer_ops",
[](const BuildStrategy &self) {
return self.fuse_all_optimizer_ops_ == true ||
self.fuse_all_optimizer_ops_ == paddle::none;
},
[](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, "
"cannot be configured again."));
self.fuse_all_optimizer_ops_ = b;
})
.def_property(
"sync_batch_norm",
[](const BuildStrategy &self) { return self.sync_batch_norm_; },
[](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be "
"configured again."));
self.sync_batch_norm_ = b;
},
R"DOC((bool, optional): sync_batch_norm indicates whether to use
synchronous batch normalization which synchronizes the mean
and variance through multi-devices in training phase.
Current implementation doesn't support FP16 training and CPU.
And only synchronous on one machine, not all machines.
Default is False.
Examples:
.. code-block:: python
import paddle
import paddle.static as static
paddle.enable_static()
build_strategy = static.BuildStrategy()
build_strategy.sync_batch_norm = True
)DOC")
.def_property(
"memory_optimize",
[](const BuildStrategy &self) -> py::object {
if (self.memory_optimize_) {
return py::cast(self.memory_optimize_.get());
} else {
return py::cast(nullptr);
}
},
[](BuildStrategy &self, const py::handle &value) {
auto *py_obj = value.ptr();
if (py_obj == nullptr || py_obj == Py_None) {
self.memory_optimize_ = paddle::none;
} else if (PyBool_Check(py_obj)) {
self.memory_optimize_ = (py_obj == Py_True);
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"BuildStrategy.memory_optimize must be set to None, False "
"or True"));
}
},
R"DOC((bool, optional): memory opitimize aims to save total memory
consumption, set to True to enable it.
Default None. None means framework would choose to use or not use
this strategy automatically. Currently, None means that it is
enabled when GC is disabled, and disabled when GC is enabled.
True means enabling and False means disabling. Default is None.
Examples:
.. code-block:: python
import paddle
import paddle.static as static
paddle.enable_static()
build_strategy = static.BuildStrategy()
build_strategy.memory_optimize = True
)DOC")
.def_property(
"is_distribution",
[](const BuildStrategy &self) { return self.is_distribution_; },
[](BuildStrategy &self, bool b) {
#ifdef WIN32
if (b) {
PADDLE_THROW(platform::errors::Unavailable(
"Distribution mode is not supported on Windows platform."));
}
#else
self.is_distribution_ = b;
#endif
})
.def_property(
"async_mode",
[](const BuildStrategy &self) { return self.async_mode_; },
[](BuildStrategy &self, bool b) { self.async_mode_ = b; })
.def_property(
"enable_inplace",
[](const BuildStrategy &self) { return self.enable_inplace_; },
[](BuildStrategy &self, bool b) { self.enable_inplace_ = b; })
.def_property(
"enable_addto",
[](const BuildStrategy &self) { return self.enable_addto_; },
[](BuildStrategy &self, bool b) { self.enable_addto_ = b; })
.def_property(
"fuse_all_reduce_ops",
[](const BuildStrategy &self) {
return self.fuse_all_reduce_ops_ == true ||
self.fuse_all_reduce_ops_ == paddle::none;
},
[](BuildStrategy &self, bool b) { self.fuse_all_reduce_ops_ = b; })
.def_property(
"enable_backward_optimizer_op_deps",
[](const BuildStrategy &self) {
return self.enable_backward_optimizer_op_deps_;
},
[](BuildStrategy &self, bool b) {
self.enable_backward_optimizer_op_deps_ = b;
})
.def_property(
"cache_runtime_context",
[](const BuildStrategy &self) { return self.cache_runtime_context_; },
[](BuildStrategy &self, bool b) { self.cache_runtime_context_ = b; })
.def_property(
"mkldnn_enabled_op_types",
[](const BuildStrategy &self) {
return self.mkldnn_enabled_op_types_;
},
[](BuildStrategy &self,
const std::unordered_set<std::string> &mkldnn_enabled_op_types) {
self.mkldnn_enabled_op_types_ = mkldnn_enabled_op_types;
})
.def_property(
"fix_op_run_order",
[](const BuildStrategy &self) { return self.fix_op_run_order_; },
[](BuildStrategy &self, bool fix_op_run_order) {
self.fix_op_run_order_ = fix_op_run_order;
})
.def_property(
"allow_cuda_graph_capture",
[](const BuildStrategy &self) {
return self.allow_cuda_graph_capture_;
},
[](BuildStrategy &self, bool allow_cuda_graph_capture) {
self.allow_cuda_graph_capture_ = allow_cuda_graph_capture;
})
.def("_copy",
[](const BuildStrategy &self) {
auto new_bs = self;
new_bs.ClearFinalized();
return new_bs;
})
.def(
"_finalize_strategy_and_create_passes",
[](BuildStrategy &self) -> std::shared_ptr<ir::PassBuilder> {
return self.CreatePassesFromStrategy(true);
},
R"DOC(Allow user to customized passes. Normally model-specific
optimization passes should be defined in this way. BuildStrategy
cannot be updated after being finalized.)DOC");
// Module-level function `_set_cached_executor_build_strategy(program_id,
// build_strategy)`: forwards both arguments to SetBuildStrategy() on the
// object returned by framework::ExecutorInfoCache::Instance().
m.def("_set_cached_executor_build_strategy",
      [](int64_t program_id, const BuildStrategy &build_strategy) {
        framework::ExecutorInfoCache::Instance().SetBuildStrategy(
            program_id, build_strategy);
      });
// Constructor and method bindings registered on `pe` (presumably the
// py::class_ object for ParallelExecutor; it is declared outside this
// excerpt -- confirm).
pe.def(py::init<const std::vector<platform::Place> &,
                const std::vector<std::string> &,
                const std::string &,
                Scope *,
                std::vector<Scope *> &,
                const ExecutionStrategy &,
                const BuildStrategy &,
                ir::Graph *>())
    // NOTE: even if we return a vec<Scope*>* to Python with the reference
    // policy, we still cannot get local_scope from this vector, since the
    // elements of vec<Scope*> will be freed by Python GC. We can only return
    // Scope* one by one and mark them as reference.
    .def(
        "local_scopes",
        [](ParallelExecutor &self) -> std::vector<Scope *> * {
          auto &scopes = self.GetLocalScopes();
          return &scopes;
        },
        py::return_value_policy::reference)
    .def("drop_local_exe_scopes", &ParallelExecutor::DropLocalExeScopes)
    .def("_need_create_local_exe_scopes",
         &ParallelExecutor::NeedCreateLocalExeScope)
    .def("feed_tensors_into_local_scopes",
         &ParallelExecutor::FeedTensorsIntoLocalScopes)
    .def("feed_and_split_tensor_into_local_scopes",
         &ParallelExecutor::FeedAndSplitTensorIntoLocalScopes)
    // `return_merged` selects RunAndMerge() (FetchList) over Run()
    // (FetchUnmergedList). In both paths the GIL is released only around the
    // executor call: the release guard is destroyed at the closing brace of
    // its scope, before py::cast builds the Python result.
    .def("run",
         [](ParallelExecutor &self,
            const std::vector<std::string> &fetch_tensors,
            bool return_merged) -> py::object {
           if (!return_merged) {
             paddle::framework::FetchUnmergedList unmerged;
             {
               pybind11::gil_scoped_release no_gil;
               unmerged = self.Run(fetch_tensors);
             }
             return py::cast(std::move(unmerged));
           }
           paddle::framework::FetchList merged;
           {
             pybind11::gil_scoped_release no_gil;
             merged = self.RunAndMerge(fetch_tensors);
           }
           return py::cast(std::move(merged));
         })
    .def("device_count", &ParallelExecutor::DeviceCount);
#ifdef PADDLE_WITH_IPU
py::class_<platform::ipu::IpuBackend,
std::unique_ptr<platform::ipu::IpuBackend, py::nodelete>>(
......@@ -4790,6 +2352,9 @@ All parameter, weight, gradient are variables in Paddle.
BindFleetWrapper(&m);
BindIO(&m);
BindParallelExecutor(m);
BindPlace(m);
BindTensor(m);
#if defined(PADDLE_WITH_PSLIB) && !defined(PADDLE_WITH_HETERPS)
BindHeterWrapper(&m);
......
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Copyright (c) 2022 NVIDIA Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <Python.h>
#include <algorithm>
#include <cctype>
#include <cstdlib>
#include <iterator>
#include <map>
#include <memory>
#include <mutex> // NOLINT // for call_once
#include <string>
#include <tuple>
#include <type_traits>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/framework/custom_operator.h"
#include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/framework/data_type_transform.h"
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/executor_cache.h"
#include "paddle/fluid/framework/executor_gc_helper.h"
#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/garbage_collector.h"
#include "paddle/fluid/framework/io/fs.h"
#include "paddle/fluid/framework/ir/coalesce_grad_tensor_pass.h"
#include "paddle/fluid/framework/ir/cost_model.h"
#include "paddle/fluid/framework/ir/generate_pass.h"
#include "paddle/fluid/framework/ir/pass_builder.h"
#include "paddle/fluid/framework/lod_rank_table.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/new_executor/executor_statistics.h"
#include "paddle/fluid/framework/new_executor/standalone_executor.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/framework/parallel_executor.h"
#include "paddle/fluid/framework/phi_utils.h"
#include "paddle/fluid/framework/prune.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/save_load_util.h"
#include "paddle/fluid/framework/scope_pool.h"
#include "paddle/fluid/framework/selected_rows_utils.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/framework/trainer.h"
#include "paddle/fluid/framework/type_defs.h"
#include "paddle/fluid/framework/version.h"
#include "paddle/fluid/imperative/amp_auto_cast.h"
#include "paddle/fluid/imperative/layer.h"
#include "paddle/fluid/memory/allocation/allocator_strategy.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/memory/allocation/cuda_ipc_allocator.h"
#endif
#include "paddle/fluid/memory/allocation/mmap_allocator.h"
#include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/common_infer_shape_functions.h"
#include "paddle/fluid/operators/py_func_op.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/init.h"
#include "paddle/fluid/platform/monitor.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/profiler/event_python.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/fluid/platform/profiler/profiler.h"
#include "paddle/fluid/pybind/cuda_streams_py.h"
#include "paddle/fluid/pybind/distributed_py.h"
#include "paddle/fluid/pybind/eager.h"
#include "paddle/fluid/pybind/imperative.h"
#include "paddle/fluid/pybind/io.h"
#include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/lod_utils.h"
#include "paddle/utils/none.h"
#ifdef PADDLE_WITH_ASCEND
#include "paddle/fluid/pybind/ascend_wrapper_py.h"
#endif
#include "paddle/fluid/pybind/bind_cost_model.h"
#include "paddle/fluid/pybind/bind_fleet_executor.h"
#include "paddle/fluid/pybind/box_helper_py.h"
#include "paddle/fluid/pybind/communication.h"
#include "paddle/fluid/pybind/compatible.h"
#include "paddle/fluid/pybind/const_value.h"
#include "paddle/fluid/pybind/data_set_py.h"
#include "paddle/fluid/pybind/exception.h"
#include "paddle/fluid/pybind/fleet_wrapper_py.h"
#include "paddle/fluid/pybind/generator_py.h"
#include "paddle/fluid/pybind/global_value_getter_setter.h"
#include "paddle/fluid/pybind/gloo_context_py.h"
#include "paddle/fluid/pybind/gloo_wrapper_py.h"
#include "paddle/fluid/pybind/heter_wrapper_py.h"
#include "paddle/fluid/pybind/inference_api.h"
#include "paddle/fluid/pybind/ir.h"
#include "paddle/fluid/pybind/metrics_py.h"
#include "paddle/fluid/pybind/ps_gpu_wrapper_py.h"
#include "paddle/fluid/pybind/pybind_boost_headers.h"
#include "paddle/phi/backends/device_manager.h"
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/pybind/nccl_wrapper_py.h"
#endif
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/pybind/protobuf.h"
#include "paddle/fluid/pybind/pybind.h" // NOLINT
#include "paddle/fluid/pybind/reader_py.h"
#include "paddle/fluid/pybind/tensor_py.h"
#include "paddle/fluid/string/to_string.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
#endif
#ifndef PADDLE_WITH_HIP
#include "paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h"
#endif
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#endif
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/npu/npu_info.h"
#endif
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/device/xpu/xpu_info.h"
#include "paddle/fluid/platform/device/xpu/xpu_op_list.h"
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
#include "paddle/phi/capi/capi.h"
#endif
#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
#ifdef PADDLE_WITH_IPU
#include "paddle/fluid/platform/device/ipu/ipu_backend.h"
#include "paddle/fluid/platform/device/ipu/ipu_info.h"
#endif
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
#ifdef PADDLE_WITH_CRYPTO
#include "paddle/fluid/pybind/crypto.h"
#endif
#if defined PADDLE_WITH_PSCORE
#include "paddle/fluid/pybind/fleet_py.h"
#endif
#ifdef PADDLE_WITH_CINN
#include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h"
#endif
#include "paddle/fluid/eager/api/utils/global_utils.h"
#include "paddle/fluid/imperative/layout_autotune.h"
#include "paddle/fluid/pybind/eager_utils.h"
#include "paddle/fluid/pybind/tensor.h"
#include "paddle/phi/api/ext/op_meta_info.h"
#include "paddle/phi/kernels/autotune/cache.h"
#include "paddle/phi/kernels/autotune/switch_autotune.h"
#include "pybind11/stl.h"
DECLARE_bool(use_mkldnn);
// disable auto conversion to list in Python
PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray);
PYBIND11_MAKE_OPAQUE(paddle::framework::FetchUnmergedList);
PYBIND11_MAKE_OPAQUE(paddle::framework::FetchList);
PYBIND11_MAKE_OPAQUE(paddle::framework::FetchType);
namespace paddle {
namespace pybind {
PyTypeObject *g_framework_tensor_pytype = nullptr;
// Copies `src` into `dst` on `place` through framework::TensorCopy.
// A negative `batch_size` copies `src` as it is; otherwise the tensor returned
// by `src.Slice(0, batch_size)` is copied instead.
template <typename PlaceType>
static void TensorCopyFrom(framework::Tensor *dst,
                           const framework::Tensor &src,
                           const PlaceType &place,
                           int64_t batch_size) {
  const bool copy_whole_tensor = batch_size < 0;
  if (copy_whole_tensor) {
    framework::TensorCopy(src, place, dst);
    return;
  }
  const auto leading_slice = src.Slice(0, batch_size);
  framework::TensorCopy(leading_slice, place, dst);
}
void BindTensor(pybind11::module &m) { // NOLINT
using namespace paddle::framework; // NOLINT
py::class_<framework::Tensor> framework_tensor(
m, "Tensor", py::buffer_protocol());
g_framework_tensor_pytype =
reinterpret_cast<PyTypeObject *>(framework_tensor.ptr());
framework_tensor
.def("__array__",
[](framework::Tensor &self) { return TensorToPyArray(self); })
.def("_ptr",
[](const framework::Tensor &self) {
return reinterpret_cast<uintptr_t>(self.data());
})
.def("_slice", &framework::Tensor::Slice)
.def("_numel", &framework::Tensor::numel)
.def("_is_initialized",
[](const framework::Tensor &self) { return self.IsInitialized(); })
.def("_get_dims",
[](const framework::Tensor &self) { return vectorize(self.dims()); })
.def("_set_dims",
[](framework::Tensor &self, const std::vector<int64_t> &dim) {
self.Resize(phi::make_ddim(dim));
})
.def("_set_layout",
[](framework::Tensor &self, const std::string &layout) {
self.set_layout(StringToDataLayout(layout));
})
.def("_alloc_float",
[](framework::Tensor &self, paddle::platform::CustomPlace &place) {
self.mutable_data<float>(place);
})
.def("_alloc_float",
[](framework::Tensor &self, paddle::platform::CUDAPlace &place) {
self.mutable_data<float>(place);
})
.def("_alloc_float",
[](framework::Tensor &self, paddle::platform::XPUPlace &place) {
self.mutable_data<float>(place);
})
.def("_alloc_float",
[](framework::Tensor &self, paddle::platform::CPUPlace &place) {
self.mutable_data<float>(place);
})
.def("_alloc_float",
[](framework::Tensor &self, paddle::platform::NPUPlace &place) {
self.mutable_data<float>(place);
})
.def("_alloc_float",
[](framework::Tensor &self, paddle::platform::MLUPlace &place) {
self.mutable_data<float>(place);
})
.def("_alloc_double",
[](framework::Tensor &self, paddle::platform::CPUPlace &place) {
self.mutable_data<double>(place);
})
.def("_alloc_int",
[](framework::Tensor &self, paddle::platform::CPUPlace &place) {
self.mutable_data<int>(place);
})
.def("_alloc_int",
[](framework::Tensor &self, paddle::platform::CustomPlace &place) {
self.mutable_data<int>(place);
})
.def("_alloc_int",
[](framework::Tensor &self, paddle::platform::XPUPlace &place) {
self.mutable_data<int>(place);
})
.def("_alloc_int",
[](framework::Tensor &self, paddle::platform::CUDAPlace &place) {
self.mutable_data<int>(place);
})
.def("_alloc_int",
[](framework::Tensor &self, paddle::platform::MLUPlace &place) {
self.mutable_data<int>(place);
})
.def("_alloc_int",
[](framework::Tensor &self,
paddle::platform::CUDAPinnedPlace &place) {
self.mutable_data<int>(place);
})
.def("_alloc_float",
[](framework::Tensor &self,
paddle::platform::CUDAPinnedPlace &place) {
self.mutable_data<float>(place);
})
.def("_mutable_data",
[](framework::Tensor &self,
paddle::platform::CPUPlace &place,
paddle::framework::proto::VarType::Type type) {
return reinterpret_cast<uintptr_t>(
self.mutable_data(place, framework::TransToPhiDataType(type)));
})
.def("_mutable_data",
[](framework::Tensor &self,
paddle::platform::CustomPlace &place,
paddle::framework::proto::VarType::Type type) {
return reinterpret_cast<uintptr_t>(
self.mutable_data(place, framework::TransToPhiDataType(type)));
})
.def("_mutable_data",
[](framework::Tensor &self,
paddle::platform::XPUPlace &place,
paddle::framework::proto::VarType::Type type) {
return reinterpret_cast<uintptr_t>(
self.mutable_data(place, framework::TransToPhiDataType(type)));
})
.def("_mutable_data",
[](framework::Tensor &self,
paddle::platform::CUDAPlace &place,
paddle::framework::proto::VarType::Type type) {
return reinterpret_cast<uintptr_t>(
self.mutable_data(place, framework::TransToPhiDataType(type)));
})
.def("_mutable_data",
[](framework::Tensor &self,
paddle::platform::CUDAPinnedPlace &place,
paddle::framework::proto::VarType::Type type) {
return reinterpret_cast<uintptr_t>(
self.mutable_data(place, framework::TransToPhiDataType(type)));
})
.def("_mutable_data",
[](framework::Tensor &self,
paddle::platform::MLUPlace &place,
paddle::framework::proto::VarType::Type type) {
return reinterpret_cast<uintptr_t>(
self.mutable_data(place, framework::TransToPhiDataType(type)));
})
.def("_clear", &framework::Tensor::clear)
.def("_mutable_data",
[](framework::Tensor &self,
paddle::platform::NPUPlace &place,
paddle::framework::proto::VarType::Type type) {
return reinterpret_cast<uintptr_t>(
self.mutable_data(place, framework::TransToPhiDataType(type)));
})
.def("_copy_from",
&TensorCopyFrom<paddle::platform::CPUPlace>,
py::arg("tensor"),
py::arg("place"),
py::arg("batch_size") = -1)
.def("_copy_from",
&TensorCopyFrom<paddle::platform::CustomPlace>,
py::arg("tensor"),
py::arg("place"),
py::arg("batch_size") = -1)
.def("_copy_from",
&TensorCopyFrom<paddle::platform::XPUPlace>,
py::arg("tensor"),
py::arg("place"),
py::arg("batch_size") = -1)
.def("_copy_from",
&TensorCopyFrom<paddle::platform::CUDAPlace>,
py::arg("tensor"),
py::arg("place"),
py::arg("batch_size") = -1)
.def("_copy_from",
&TensorCopyFrom<paddle::platform::NPUPlace>,
py::arg("tensor"),
py::arg("place"),
py::arg("batch_size") = -1)
.def("_copy_from",
&TensorCopyFrom<paddle::platform::CUDAPinnedPlace>,
py::arg("tensor"),
py::arg("place"),
py::arg("batch_size") = -1)
.def("_copy_from",
&TensorCopyFrom<paddle::platform::MLUPlace>,
py::arg("tensor"),
py::arg("place"),
py::arg("batch_size") = -1)
.def("_copy_from",
&TensorCopyFrom<paddle::platform::Place>,
py::arg("tensor"),
py::arg("place"),
py::arg("batch_size") = -1)
.def("set",
SetTensorFromPyArray<paddle::platform::CPUPlace>,
py::arg("array"),
py::arg("place"),
py::arg("zero_copy") = false)
.def("set",
SetTensorFromPyArray<paddle::platform::CustomPlace>,
py::arg("array"),
py::arg("place"),
py::arg("zero_copy") = false)
.def("set",
SetTensorFromPyArray<paddle::platform::XPUPlace>,
py::arg("array"),
py::arg("place"),
py::arg("zero_copy") = false)
.def("set",
SetTensorFromPyArray<paddle::platform::CUDAPlace>,
py::arg("array"),
py::arg("place"),
py::arg("zero_copy") = false)
.def("set",
SetTensorFromPyArray<paddle::platform::NPUPlace>,
py::arg("array"),
py::arg("place"),
py::arg("zero_copy") = false)
.def("set",
SetTensorFromPyArray<paddle::platform::IPUPlace>,
py::arg("array"),
py::arg("place"),
py::arg("zero_copy") = false)
.def("set",
SetTensorFromPyArray<paddle::platform::MLUPlace>,
py::arg("array"),
py::arg("place"),
py::arg("zero_copy") = false)
.def("set",
SetTensorFromPyArray<paddle::platform::CUDAPinnedPlace>,
py::arg("array"),
py::arg("place"),
py::arg("zero_copy") = false,
R"DOC(
Set the data of Tensor on place with given numpy array.
Args:
lod (numpy.ndarray): The data to set.
place (CPUPlace|CUDAPlace|XPUPlace|IPUPlace|CUDAPinnedPlace|NPUPlace|MLUPlace): The place where the
Tensor is to be set.
zero_copy (bool, optional): Whether to share memory with the input numpy array.
This parameter only works with CPUPlace. Default: False.
Returns:
None.
Examples:
.. code-block:: python
import paddle.fluid as fluid
import numpy as np
t = fluid.Tensor()
t.set(np.ndarray([5, 30]), fluid.CPUPlace())
)DOC")
.def(
"shape",
[](framework::Tensor &self) { return vectorize(self.dims()); },
R"DOC(
Return the shape of Tensor.
Returns:
list[int]: The shape of Tensor.
Examples:
.. code-block:: python
import paddle.fluid as fluid
import numpy as np
t = fluid.Tensor()
t.set(np.ndarray([5, 30]), fluid.CPUPlace())
print(t.shape()) # [5, 30]
)DOC")
// `_to_dlpack`: wraps the DLManagedTensor produced for this tensor in a
// Python capsule named "dltensor" and returns the capsule.
.def("_to_dlpack",
     [](framework::Tensor &self) {
       DLPackTensor dlpack_tensor(self, 1);
       DLManagedTensor *dmt = dlpack_tensor.ToDLManagedTensor();
       auto capsule = py::capsule(
           static_cast<void *>(dmt), "dltensor", [](PyObject *ptr) {
             // Capsule destructor. PyCapsule_GetPointer() reports a name
             // mismatch by returning NULL and setting a Python error; it
             // never throws a C++ exception, so a try/catch cannot be used
             // to probe the name. PyCapsule_IsValid() performs the same
             // check without failing and without leaving an error pending.
             //
             // Only a capsule that still carries the name "dltensor" is
             // released here. A capsule that was renamed (e.g. to
             // "used_dltensor") is left alone, exactly as before.
             if (ptr == nullptr || !PyCapsule_IsValid(ptr, "dltensor")) {
               return;
             }
             auto *dltensor = reinterpret_cast<DLManagedTensor *>(
                 PyCapsule_GetPointer(ptr, "dltensor"));
             if (dltensor->deleter != nullptr) {
               dltensor->deleter(dltensor);
             }
           });
       return capsule;
     })
.def("_set_float_element", TensorSetElement<float>)
.def("_get_float_element", TensorGetElement<float>)
.def("_set_double_element", TensorSetElement<double>)
.def("_get_double_element", TensorGetElement<double>)
.def("_place", [](framework::Tensor &self) { return self.place(); })
.def("_dtype",
[](framework::Tensor &self) {
return framework::TransToProtoVarType(self.type());
})
.def("_layout",
[](framework::Tensor &self) {
return DataLayoutToString(self.layout());
})
.def("_share_data_with", &framework::Tensor::ShareDataWith)
.def("__getitem__", PySliceTensor, py::return_value_policy::reference)
.def("__str__",
[](const framework::Tensor &self) {
std::stringstream ostr;
ostr << self;
return ostr.str();
}) /* ------ End of original Tensor ------ */
// `__init__(recursive_sequence_lengths)`: builds the Tensor in place from
// length-based level-of-detail info. The lengths are converted to the
// offset-based form with ConvertToOffsetBasedLoD and checked with
// CheckLoD(..., -1) before construction.
// NOTE(review): -1 presumably disables the comparison against a tensor
// height -- confirm against CheckLoD.
.def("__init__",
     [](framework::Tensor &instance,
        const std::vector<std::vector<size_t>>
            &recursive_sequence_lengths) {
       LoD new_lod;
       new_lod.reserve(recursive_sequence_lengths.size());
       std::copy(recursive_sequence_lengths.begin(),
                 recursive_sequence_lengths.end(),
                 std::back_inserter(new_lod));
       LoD new_offset_lod = ConvertToOffsetBasedLoD(new_lod);
       // The message announces the *converted* LoD, so report
       // new_offset_lod rather than the length-based input.
       PADDLE_ENFORCE_EQ(
           CheckLoD(new_offset_lod, -1),
           true,
           platform::errors::InvalidArgument(
               "The provided recursive_sequence_lengths info is "
               "invalid, "
               "the LoD converted by recursive_sequence_lengths is %s",
               new_offset_lod));
       new (&instance) framework::Tensor(new_offset_lod);
     })
.def("__init__",
[](framework::Tensor &instance) {
new (&instance) framework::Tensor();
})
// We implement offset-based LoD in C++ while the Python API uses the
// length-based form, so set_lod was changed to
// set_recursive_sequence_lengths to avoid misuse.
// The discussion is here:
// https://github.com/PaddlePaddle/Paddle/issues/10855
.def(
"set_lod",
[](framework::Tensor &self,
const std::vector<std::vector<size_t>> &lod) {
// the input lod is offset-based level-of-detail info
LoD new_lod;
new_lod.reserve(lod.size());
std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
PADDLE_ENFORCE_EQ(
CheckLoD(new_lod, vectorize(self.dims()).front()),
true,
platform::errors::InvalidArgument(
"The provided LoD is invalid, the LoD is %s", new_lod));
self.set_lod(new_lod);
},
py::arg("lod"),
R"DOC(
Set LoD of the Tensor.
Args:
lod (list[list[int]]): The lod to set.
Returns:
None.
Examples:
.. code-block:: python
import paddle.fluid as fluid
import numpy as np
t = fluid.Tensor()
t.set(np.ndarray([5, 30]), fluid.CPUPlace())
t.set_lod([[0, 2, 5]])
print(t.lod()) # [[0, 2, 5]]
)DOC")
// `set_recursive_sequence_lengths`: takes length-based level-of-detail info,
// converts it to the offset-based form with ConvertToOffsetBasedLoD, checks
// the result with CheckLoD against the first entry of the tensor's dims and
// stores it through set_lod.
.def(
    "set_recursive_sequence_lengths",
    [](framework::Tensor &self,
       const std::vector<std::vector<size_t>>
           &recursive_sequence_lengths) {
      // the input recursive_sequence_lengths is length-based
      // level-of-detail info
      LoD new_lod;
      new_lod.reserve(recursive_sequence_lengths.size());
      std::copy(recursive_sequence_lengths.begin(),
                recursive_sequence_lengths.end(),
                std::back_inserter(new_lod));
      LoD new_offset_lod = ConvertToOffsetBasedLoD(new_lod);
      // The message announces the *converted* LoD, so report
      // new_offset_lod rather than the length-based input.
      PADDLE_ENFORCE_EQ(
          CheckLoD(new_offset_lod, vectorize(self.dims()).front()),
          true,
          platform::errors::InvalidArgument(
              "The provided recursive_sequence_lengths info is "
              "invalid, "
              "the LoD converted by recursive_sequence_lengths is "
              "%s",
              new_offset_lod));
      self.set_lod(new_offset_lod);
    },
    py::arg("recursive_sequence_lengths"),
    R"DOC(
Set LoD of the Tensor according to recursive sequence lengths.
For example, if recursive_sequence_lengths=[[2, 3]], which means
there are two sequences with length 2 and 3 respectively, the
corresponding lod would be [[0, 2, 2+3]], i.e., [[0, 2, 5]].
Args:
recursive_sequence_lengths (list[list[int]]): The recursive sequence lengths.
Returns:
None.
Examples:
.. code-block:: python
import paddle.fluid as fluid
import numpy as np
t = fluid.Tensor()
t.set(np.ndarray([5, 30]), fluid.CPUPlace())
t.set_recursive_sequence_lengths([[2, 3]])
print(t.recursive_sequence_lengths()) # [[2, 3]]
print(t.lod()) # [[0, 2, 5]]
)DOC")
.def(
"lod",
[](framework::Tensor &self) -> std::vector<std::vector<size_t>> {
// output the offset-based lod info
LoD lod = self.lod();
std::vector<std::vector<size_t>> new_lod;
new_lod.reserve(lod.size());
std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
return new_lod;
},
R"DOC(
Return the LoD of the Tensor.
Returns:
list[list[int]]: The lod of the Tensor.
Examples:
.. code-block:: python
import paddle.fluid as fluid
import numpy as np
t = fluid.Tensor()
t.set(np.ndarray([5, 30]), fluid.CPUPlace())
t.set_lod([[0, 2, 5]])
print(t.lod()) # [[0, 2, 5]]
)DOC")
// See the comments above set_lod.
.def(
"recursive_sequence_lengths",
[](framework::Tensor &self) -> std::vector<std::vector<size_t>> {
// output the length-based lod info
LoD lod = phi::ConvertToLengthBasedLoD(self.lod());
std::vector<std::vector<size_t>> new_lod;
new_lod.reserve(lod.size());
std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
return new_lod;
},
R"DOC(
Return the recursive sequence lengths corresponding to of the LodD
of the Tensor.
Returns:
list[list[int]]: The recursive sequence lengths.
Examples:
.. code-block:: python
import paddle.fluid as fluid
import numpy as np
t = fluid.Tensor()
t.set(np.ndarray([5, 30]), fluid.CPUPlace())
t.set_recursive_sequence_lengths([[2, 3]])
print(t.recursive_sequence_lengths()) # [[2, 3]]
)DOC")
.def(
"has_valid_recursive_sequence_lengths",
[](framework::Tensor &self) -> bool {
// Check that the lod info is valid and match the outermost
// dimension of the Tensor data
return CheckLoD(self.lod(), vectorize(self.dims()).front());
},
R"DOC(
Check whether the LoD of the Tensor is valid.
Returns:
bool: Whether the LoD is valid.
Examples:
.. code-block:: python
import paddle.fluid as fluid
import numpy as np
t = fluid.Tensor()
t.set(np.ndarray([5, 30]), fluid.CPUPlace())
t.set_recursive_sequence_lengths([[2, 3]])
print(t.has_valid_recursive_sequence_lengths()) # True
)DOC")
.def("_as_type",
[](const framework::Tensor &self,
paddle::framework::proto::VarType::Type type) {
framework::Tensor dst;
if (self.IsInitialized() && self.numel() > 0) {
TransDataType(self, type, &dst);
}
return dst;
})
.def("_copy",
[](const framework::Tensor &self, const platform::Place &place) {
// follow fetch_op's implementation
framework::Tensor dst;
if (self.IsInitialized() && self.numel() > 0) {
TensorCopySync(self, place, &dst);
} else {
// Not copy, if the src tensor is empty.
dst.clear();
dst.Resize({0});
}
dst.set_lod(self.lod());
return dst;
#ifdef _WIN32
});
#else
})
#ifdef PADDLE_WITH_CUDA
.def("_share_buffer_with",
[](framework::Tensor &self, const framework::Tensor src,
py::tuple t) {
auto *cuda_ipc_allocation =
dynamic_cast<memory::allocation::CudaIpcAllocation *>(
src.Holder().get());
PADDLE_ENFORCE_NOT_NULL(
cuda_ipc_allocation,
platform::errors::PreconditionNotMet(
"Tensor is not Cuda IPC shared tensor. "
"Now only Tensor shared by cuda ipc could use this "
"api."));
size_t size = t[0].cast<size_t>();
auto dtype =
static_cast<paddle::experimental::DataType>(t[1].cast<int>());
auto dims = phi::make_ddim(t[2].cast<std::vector<int>>());
auto lod_info = t[3].cast<framework::LoD>();
auto device_id = t[4].cast<int>();
auto shared_reader_holder =
std::make_shared<memory::allocation::Allocation>(
cuda_ipc_allocation->ptr(),
cuda_ipc_allocation->base_ptr(), size,
platform::CUDAPlace(device_id));
self.ResetHolderWithType(shared_reader_holder, dtype);
self.Resize(dims);
self.set_lod(lod_info);
VLOG(6) << "Reconstructed tensor with buffer shared!";
},
R"DOC(
Deserialize GPU Tensor for existed shared Cuda IPC tensor.
Params:
tensor: Shared Cuda IPC tensor.
tuple: contrains data size, data type,
tensor dims, lod information, device index.
)DOC")
.def("_share_cuda",
     [](framework::Tensor self) {
       // Serializes a GPU tensor for sharing through a CUDA IPC memory
       // handle. Returns the tuple (handle bytes, byte offset of the data
       // inside the base allocation, data size, dtype index, dims, lod,
       // device id).
       if (!self.IsInitialized() || self.numel() == 0)
         throw std::runtime_error(
             "Tensor not initialized or numel is 0. could not pass "
             "to shared memory. ");
       auto *holder = dynamic_cast<memory::allocation::Allocation *>(
           self.Holder().get());
       // dynamic_cast yields nullptr when the holder is not of the
       // requested type; report that instead of dereferencing null below.
       PADDLE_ENFORCE_NOT_NULL(
           holder,
           platform::errors::PreconditionNotMet(
               "Tensor holder is not a memory::allocation::Allocation. "
               "share_cuda could not obtain its base pointer."));
       PADDLE_ENFORCE_EQ(
           platform::is_gpu_place(holder->place()), true,
           platform::errors::InvalidArgument(
               "Tensor is not on GPU. share_cuda only support GPU "
               "Tensor, share_filename is for CPU tensor."));
       // The IPC handle is taken on the base pointer, so the tensor's own
       // offset inside that allocation is shipped alongside it.
       void *base_ptr = holder->base_ptr();
       ptrdiff_t offset_bytes = reinterpret_cast<char *>(holder->ptr()) -
                                reinterpret_cast<char *>(base_ptr);
       cudaIpcMemHandle_t handle;
       PADDLE_ENFORCE_GPU_SUCCESS(cudaIpcGetMemHandle(&handle, base_ptr));
       auto _handle = py::bytes(reinterpret_cast<char *>(&handle),
                                static_cast<py::ssize_t>(CUDA_IPC_HANDLE_SIZE));
       // TODO(ZHUI): use cuda event, to avoid sync.
       const auto &device_id = paddle::platform::GetCurrentDeviceId();
       auto stream =
           paddle::platform::stream::get_current_stream(device_id);
       stream->Synchronize();
       int type_idx = static_cast<int>(self.type());
       size_t data_size =
           self.numel() *
           framework::SizeOfType(
               framework::TransToProtoVarType(self.type()));
       return py::make_tuple(_handle,
                             static_cast<py::size_t>(offset_bytes),
                             data_size, type_idx, vectorize(self.dims()),
                             self.lod(), device_id);
     },
     R"DOC(
Serialize GPU Tensor by cudaIpcMemHandle.
Returns:
tuple: contrains handle, data size, data type,
tensor dims, lod information, device index.
Examples:
.. code-block:: python
import paddle
tensor = paddle.ones([3,3])
metainfo = tensor.value().get_tensor()._share_cuda()
)DOC")
.def("_new_shared_cuda",
     [](py::tuple t) {
       // Rebuilds a tensor from the 7-element tuple
       // (handle, byte offset, data size, dtype index, dims, lod,
       // device id).
       if (t.size() != 7)
         throw std::runtime_error(
             "Invalid Tensor meta info for shared cuda tensor!");

       // Recover the base pointer behind the IPC handle, then step to
       // the tensor's own data by the recorded offset.
       const std::string ipc_handle = t[0].cast<std::string>();
       ptrdiff_t offset = static_cast<ptrdiff_t>(t[1].cast<int64_t>());
       int device = t[6].cast<int>();
       auto ipc_base = memory::allocation::GetIpcBasePtr(ipc_handle);
       size_t alloc_size = t[2].cast<size_t>();
       void *data_ptr = reinterpret_cast<char *>(ipc_base.get()) + offset;
       auto ipc_holder =
           std::make_shared<memory::allocation::CudaIpcAllocation>(
               data_ptr, alloc_size, device, std::move(ipc_base));

       // Attach the allocation and restore dtype, shape and LoD.
       framework::Tensor rebuilt;
       rebuilt.ResetHolderWithType(
           ipc_holder,
           static_cast<paddle::experimental::DataType>(t[3].cast<int>()));
       rebuilt.Resize(phi::make_ddim(t[4].cast<std::vector<int>>()));
       rebuilt.set_lod(t[5].cast<framework::LoD>());
       return rebuilt;
     },
     R"DOC(
Deserialize GPU lod tensor from cudaIpcMemHandle.
Params:
tuple: contrains handle, data size, data type,
tensor dims, lod information, device index.
Examples:
.. code-block:: python
import paddle
tensor = paddle.ones([3,3])
metainfo = tensor.value().get_tensor()._share_cuda()
tensor_from_shared = paddle.to_tensor(paddle.fluid.core.LoDTensor._new_shared_cuda(metainfo))
)DOC")
#endif
.def("_share_filename",
     [](framework::Tensor &self) {
       // Returns (ipc name, size, dtype index, dims, lod) for a CPU or
       // CUDA-pinned tensor. If its holder is not yet a ref-counted
       // memory-map allocation, the data is first copied into a freshly
       // allocated one and the tensor's holder is swapped for it.
       if (!(self.IsInitialized() && self.numel() != 0))
         throw std::runtime_error(
             "Tensor not initialized or numel is 0. could not pass to "
             "shared memory. ");

       auto holder = self.Holder();
       PADDLE_ENFORCE_EQ(
           platform::is_cpu_place(holder->place()) ||
               platform::is_cuda_pinned_place(holder->place()),
           true, platform::errors::InvalidArgument(
                     "Tensor is not on CPU. share_filename only "
                     "support CPU Tensor."));

       auto *mmap_allocation =
           dynamic_cast<memory::allocation::RefcountedMemoryMapAllocation *>(
               holder.get());
       const bool already_shared = (mmap_allocation != nullptr);
       if (!already_shared) {
         void *src_ptr = self.data();
         size_t nbytes = self.numel() *
                         framework::SizeOfType(
                             framework::TransToProtoVarType(self.type()));
         int map_flags = memory::allocation::MAPPED_SHAREDMEM |
                         memory::allocation::MAPPED_EXCLUSIVE;
         std::string ipc_name = memory::allocation::GetIPCName();
         auto shared_holder =
             memory::allocation::AllocateRefcountedMemoryMapAllocation(
                 ipc_name, map_flags, nbytes);

         // Copy the payload into the new mapping, picking the source
         // place that matches where the tensor currently lives.
         if (!platform::is_cuda_pinned_place(holder->place())) {
           memory::Copy(platform::CPUPlace(), shared_holder->ptr(),
                        platform::CPUPlace(), src_ptr, nbytes);
         } else {
#ifdef PADDLE_WITH_CUDA
           memory::Copy(platform::CPUPlace(), shared_holder->ptr(),
                        platform::CUDAPinnedPlace(), src_ptr, nbytes);
#endif
         }

         // From here on the tensor owns the shared mapping.
         self.ResetHolder(shared_holder);
         mmap_allocation = shared_holder.get();
       }

       int dtype_index = static_cast<int>(self.type());
       return py::make_tuple(mmap_allocation->ipc_name(),
                             mmap_allocation->size(), dtype_index,
                             vectorize(self.dims()), self.lod());
     },
     R"DOC(
Serialize CPU lod tensor in shared memory to tuple.
If the tensor is not in shared memory, we will copy it first.
Returns:
tuple: contrains ipc name, data size, data type,
tensor dims and lod imformation.
Examples:
.. code-block:: python
import paddle
tensor = paddle.ones([3,3])
metainfo = tensor.value().get_tensor()._share_filename()
)DOC")
.def("_new_shared_filename",
     [](py::tuple t) {
       // Rebuilds a tensor from the 5-element tuple
       // (ipc name, size, dtype index, dims, lod).
       if (t.size() != 5)
         throw std::runtime_error("Invalid Tensor meta info state!");

       // Step 1: attach to the existing mapping (MAPPED_NOCREATE).
       const std::string mapping_name = t[0].cast<std::string>();
       size_t mapping_size = t[1].cast<size_t>();
       int map_flags = memory::allocation::MAPPED_SHAREDMEM |
                       memory::allocation::MAPPED_NOCREATE;
       auto mapped_holder =
           memory::allocation::AllocateRefcountedMemoryMapAllocation(
               mapping_name, map_flags, mapping_size);

       // Step 2: hang the mapping on a new tensor and restore dtype,
       // shape and LoD.
       framework::Tensor rebuilt;
       rebuilt.ResetHolderWithType(
           mapped_holder,
           static_cast<paddle::experimental::DataType>(t[2].cast<int>()));
       rebuilt.Resize(phi::make_ddim(t[3].cast<std::vector<int>>()));
       rebuilt.set_lod(t[4].cast<framework::LoD>());
       return rebuilt;
     },
     R"DOC(
Deserialize CPU lod tensor from shared memory.
Params:
tuple: contrains ipc file name, data size, data type,
tensor dims and lod information.
Examples:
.. code-block:: python
import paddle
tensor = paddle.ones([3,3])
metainfo = tensor.value().get_tensor()._share_filename()
tensor_from_shared = paddle.to_tensor(paddle.fluid.core.LoDTensor._new_shared_filename(metainfo))
)DOC")
.def("_shared_incref",
     [](framework::Tensor &self) {
       // Bumps the reference count when the holder is a ref-counted
       // memory-map allocation; any other holder is left untouched.
       using SharedMmap = memory::allocation::RefcountedMemoryMapAllocation;
       if (auto *shared = dynamic_cast<SharedMmap *>(self.Holder().get())) {
         shared->incref();
       }
     },
     R"DOC(
Increase reference count of share_filename tensor.
)DOC")
.def("_shared_decref",
     [](framework::Tensor &self) {
       // Drops one reference when the holder is a ref-counted memory-map
       // allocation; any other holder is left untouched.
       using SharedMmap = memory::allocation::RefcountedMemoryMapAllocation;
       if (auto *shared = dynamic_cast<SharedMmap *>(self.Holder().get())) {
         shared->decref();
       }
     },
     R"DOC(
Decrease reference count of share_filename tensor.
)DOC")
.def(py::pickle(
    // __getstate__: emits (ipc name, size, dtype index, dims, lod) for a
    // CPU tensor whose holder is a MemoryMapWriterAllocation.
    [](const framework::Tensor &t) {
      auto holder = t.Holder();
      // A tensor without a holder cannot be inspected any further;
      // fail cleanly instead of dereferencing a null pointer below.
      PADDLE_ENFORCE_NOT_NULL(
          holder.get(),
          platform::errors::PreconditionNotMet(
              "Tensor is not initialized. "
              "Now only Tensor on shared memory can be serialized."));
      PADDLE_ENFORCE_EQ(platform::is_cpu_place(holder->place()), true,
                        platform::errors::PreconditionNotMet(
                            "Tensor is not on CPU. "
                            "Now only Tensor on CPU can be serialized."));
      auto *mmap_writer_allocation =
          dynamic_cast<memory::allocation::MemoryMapWriterAllocation *>(
              holder.get());
      PADDLE_ENFORCE_NOT_NULL(
          mmap_writer_allocation,
          platform::errors::PreconditionNotMet(
              "Tensor is not in shared memory. "
              "Now only Tensor on shared memory can be serialized."));
      int type_idx = static_cast<int>(t.type());
      return py::make_tuple(mmap_writer_allocation->ipc_name(),
                            mmap_writer_allocation->size(), type_idx,
                            vectorize(t.dims()), t.lod());
    },
    // __setstate__: rebuilds a tensor from the 5-element tuple produced
    // by __getstate__.
    [](py::tuple t) {
      if (t.size() != 5)
        throw std::runtime_error("Invalid Tensor state!");
      // 1. Create a new C++ instance
      framework::Tensor tensor;
      // 2. Rebuild Allocation
      const std::string &ipc_name = t[0].cast<std::string>();
      size_t size = t[1].cast<size_t>();
      auto shared_reader_holder =
          memory::allocation::RebuildMemoryMapReaderAllocation(ipc_name,
                                                               size);
      // 3. Maintain global fd set
      VLOG(3) << "Tensor ipc name: " << ipc_name;
      memory::allocation::MemoryMapFdSet::Instance().Insert(ipc_name);
      // 4. Rebuild Tensor
      tensor.ResetHolderWithType(
          shared_reader_holder,
          static_cast<paddle::experimental::DataType>(t[2].cast<int>()));
      tensor.Resize(phi::make_ddim(t[3].cast<std::vector<int>>()));
      tensor.set_lod(t[4].cast<framework::LoD>());
      return tensor;
    }));
#endif
// Python binding for phi::SelectedRows.
py::class_<phi::SelectedRows>(m, "SelectedRows")
    // Default construction, done in place on the storage pybind provides.
    .def("__init__",
         [](phi::SelectedRows &self) { new (&self) phi::SelectedRows(); })
    // Construction from a row index list and a height.
    .def("__init__",
         [](phi::SelectedRows &self, const std::vector<int64_t> rows,
            const int64_t &height) {
           new (&self) phi::SelectedRows(rows, height);
         })
    // Exposes the value tensor by reference rather than by copy.
    .def(
        "get_tensor",
        [](phi::SelectedRows &self) { return self.mutable_value(); },
        py::return_value_policy::reference)
    // Element count of the value tensor.
    .def("numel",
         [](phi::SelectedRows &self) -> int64_t {
           return self.value().numel();
         })
    .def("set_height", &phi::SelectedRows::set_height)
    .def("height", &phi::SelectedRows::height)
    // GPU builds (CUDA or HIP) go through Vector<int64_t>; other builds
    // pass the std::vector straight through.
    .def("set_rows",
         [](phi::SelectedRows &self, std::vector<int64_t> rows) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
           Vector<int64_t> device_rows(rows);
           self.set_rows(device_rows);
#else
           self.set_rows(rows);
#endif
         })
    .def("sync_index",
         [](phi::SelectedRows &self) { self.SyncIndex(); })
    // Returns the row indices as a plain std::vector<int64_t>.
    .def("rows", [](phi::SelectedRows &self) {
      auto stored_rows = self.rows();
      return std::vector<int64_t>(stored_rows.begin(), stored_rows.end());
    });
}
} // namespace pybind
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "pybind11/pybind11.h"
namespace paddle {
namespace pybind {
// Declares the entry point that takes the pybind11 module `m` by mutable
// reference. Presumably the definition registers the tensor-related Python
// classes on `m` -- confirm against the matching .cc file.
// NOTE(review): the NOLINT below looks like it silences the lint rule about
// non-const reference parameters -- confirm.
void BindTensor(pybind11::module& m); // NOLINT
} // namespace pybind
} // namespace paddle
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册