The file python/paddle/fluid/tests/unittests/test_broadcast.py does not exist at revision cde6241a1bc1c4c42d3991d5a394f7f1398c702d.
Commit 7ab3f36e authored by phlrain

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into add_some_yaml_config

@@ -100,7 +100,6 @@ function(kernel_library TARGET)
  set(xpu_srcs)
  set(gpudnn_srcs)
  set(kps_srcs)
-  set(selected_rows_srcs)
  # parse and save the deps kerenl targets
  set(all_srcs)
  set(kernel_deps)
@@ -111,6 +110,12 @@ function(kernel_library TARGET)
  cmake_parse_arguments(kernel_library "${options}" "${oneValueArgs}"
                        "${multiValueArgs}" ${ARGN})
+  # used for cc_library selected_rows dir target
+  set(target_suffix "")
+  if ("${kernel_library_SUB_DIR}" STREQUAL "selected_rows_kernel")
+    set(target_suffix "_sr")
+  endif()
  list(LENGTH kernel_library_SRCS kernel_library_SRCS_len)
  # one kernel only match one impl file in each backend
@@ -121,9 +126,6 @@ function(kernel_library TARGET)
  if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/cpu/${TARGET}.cc AND NOT WITH_XPU_KP)
    list(APPEND cpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/${TARGET}.cc)
  endif()
-  if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/selected_rows/${TARGET}.cc)
-    list(APPEND selected_rows_srcs ${CMAKE_CURRENT_SOURCE_DIR}/selected_rows/${TARGET}.cc)
-  endif()
  if (WITH_GPU OR WITH_ROCM)
    if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu)
      list(APPEND gpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu)
@@ -169,26 +171,46 @@ function(kernel_library TARGET)
  list(APPEND all_srcs ${xpu_srcs})
  list(APPEND all_srcs ${gpudnn_srcs})
  list(APPEND all_srcs ${kps_srcs})
+  set(all_include_kernels)
+  set(all_kernel_name)
  foreach(src ${all_srcs})
    file(READ ${src} target_content)
+    # "kernels/xxx"(DenseTensor Kernel) can only include each other, but can't include "SUB_DIR/xxx" (such as selected_rows Kernel)
    string(REGEX MATCHALL "#include \"paddle\/phi\/kernels\/[a-z0-9_]+_kernel.h\"" include_kernels ${target_content})
-    if ("${kernel_library_SUB_DIR}" STREQUAL "")
-      string(REGEX MATCHALL "#include \"paddle\/phi\/kernels\/[a-z0-9_]+_kernel.h\"" include_kernels ${target_content})
-    else()
+    list(APPEND all_include_kernels ${include_kernels})
+    # "SUB_DIR/xxx" can include "kernels/xx" and "SUB_DIR/xxx"
+    if (NOT "${kernel_library_SUB_DIR}" STREQUAL "")
      string(REGEX MATCHALL "#include \"paddle\/phi\/kernels\/${kernel_library_SUB_DIR}\/[a-z0-9_]+_kernel.h\"" include_kernels ${target_content})
+      list(APPEND all_include_kernels ${include_kernels})
    endif()
-    foreach(include_kernel ${include_kernels})
+    foreach(include_kernel ${all_include_kernels})
      if ("${kernel_library_SUB_DIR}" STREQUAL "")
        string(REGEX REPLACE "#include \"paddle\/phi\/kernels\/" "" kernel_name ${include_kernel})
+        string(REGEX REPLACE ".h\"" "" kernel_name ${kernel_name})
+        list(APPEND all_kernel_name ${kernel_name})
      else()
-        string(REGEX REPLACE "#include \"paddle\/phi\/kernels\/${kernel_library_SUB_DIR}\/" "" kernel_name ${include_kernel})
+        # NOTE(dev): we should firstly match kernel_library_SUB_DIR.
+        if (${include_kernel} MATCHES "#include \"paddle\/phi\/kernels\/${kernel_library_SUB_DIR}\/")
+          string(REGEX REPLACE "#include \"paddle\/phi\/kernels\/${kernel_library_SUB_DIR}\/" "" kernel_name ${include_kernel})
+          # for selected_rows directory, add ${target_suffix}.
+          string(REGEX REPLACE ".h\"" "${target_suffix}" kernel_name ${kernel_name})
+          list(APPEND all_kernel_name ${kernel_name})
+        else()
+          string(REGEX REPLACE "#include \"paddle\/phi\/kernels\/" "" kernel_name ${include_kernel})
+          string(REGEX REPLACE ".h\"" "" kernel_name ${kernel_name})
+          list(APPEND all_kernel_name ${kernel_name})
+        endif()
+        message(STATUS "${TARGET} DEPS ${all_kernel_name}")
      endif()
-      string(REGEX REPLACE ".h\"" "" kernel_name ${kernel_name})
-      list(APPEND kernel_deps ${kernel_name})
+      list(APPEND kernel_deps ${all_kernel_name})
    endforeach()
  endforeach()
  list(REMOVE_DUPLICATES kernel_deps)
-  list(REMOVE_ITEM kernel_deps ${TARGET})
+  list(REMOVE_ITEM kernel_deps ${TARGET}${target_suffix})
  list(LENGTH common_srcs common_srcs_len)
  list(LENGTH cpu_srcs cpu_srcs_len)
@@ -196,92 +218,73 @@ function(kernel_library TARGET)
  list(LENGTH xpu_srcs xpu_srcs_len)
  list(LENGTH gpudnn_srcs gpudnn_srcs_len)
  list(LENGTH kps_srcs kps_srcs_len)
-  list(LENGTH selected_rows_srcs selected_rows_srcs_len)
  # kernel source file level
  # level 1: base device kernel
  # - cpu_srcs / gpu_srcs / xpu_srcs / gpudnn_srcs / kps_srcs
  # level 2: device-independent kernel
  # - common_srcs
-  # level 3: Kernel implemented by reusing device-independent kernel
-  # - selected_rows_srcs
  set(base_device_kernels)
  set(device_independent_kernel)
-  set(high_level_kernels)
  # 1. Base device kernel compile
  if (${cpu_srcs_len} GREATER 0)
-    cc_library(${TARGET}_cpu SRCS ${cpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
-    list(APPEND base_device_kernels ${TARGET}_cpu)
+    cc_library(${TARGET}_cpu${target_suffix} SRCS ${cpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+    list(APPEND base_device_kernels ${TARGET}_cpu${target_suffix})
  endif()
  if (${gpu_srcs_len} GREATER 0)
    if (WITH_GPU)
-      nv_library(${TARGET}_gpu SRCS ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+      nv_library(${TARGET}_gpu${target_suffix} SRCS ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
    elseif (WITH_ROCM)
-      hip_library(${TARGET}_gpu SRCS ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+      hip_library(${TARGET}_gpu${target_suffix} SRCS ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
    endif()
-    list(APPEND base_device_kernels ${TARGET}_gpu)
+    list(APPEND base_device_kernels ${TARGET}_gpu${target_suffix})
  endif()
  if (${xpu_srcs_len} GREATER 0)
-    cc_library(${TARGET}_xpu SRCS ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
-    list(APPEND base_device_kernels ${TARGET}_xpu)
+    cc_library(${TARGET}_xpu${target_suffix} SRCS ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+    list(APPEND base_device_kernels ${TARGET}_xpu${target_suffix})
  endif()
  if (${gpudnn_srcs_len} GREATER 0)
    if (WITH_GPU)
-      nv_library(${TARGET}_gpudnn SRCS ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+      nv_library(${TARGET}_gpudnn${target_suffix} SRCS ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
    elseif (WITH_ROCM)
-      hip_library(${TARGET}_gpudnn SRCS ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+      hip_library(${TARGET}_gpudnn${target_suffix} SRCS ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
    endif()
-    list(APPEND base_device_kernels ${TARGET}_gpudnn)
+    list(APPEND base_device_kernels ${TARGET}_gpudnn${target_suffix})
  endif()
  if (${kps_srcs_len} GREATER 0)
    # only when WITH_XPU_KP, the kps_srcs_len can be > 0
-    xpu_library(${TARGET}_kps SRCS ${kps_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
-    list(APPEND base_device_kernels ${TARGET}_kps)
+    xpu_library(${TARGET}_kps${target_suffix} SRCS ${kps_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+    list(APPEND base_device_kernels ${TARGET}_kps${target_suffix})
  endif()
  # 2. Device-independent kernel compile
  if (${common_srcs_len} GREATER 0)
    if (WITH_GPU)
-      nv_library(${TARGET}_common SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels})
+      nv_library(${TARGET}_common${target_suffix} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels})
    elseif (WITH_ROCM)
-      hip_library(${TARGET}_common SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels})
+      hip_library(${TARGET}_common${target_suffix} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels})
    elseif (WITH_XPU_KP)
-      xpu_library(${TARGET}_common SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels})
+      xpu_library(${TARGET}_common${target_suffix} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels})
    else()
-      cc_library(${TARGET}_common SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels})
+      cc_library(${TARGET}_common${target_suffix} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels})
    endif()
-    list(APPEND device_independent_kernel ${TARGET}_common)
+    list(APPEND device_independent_kernel ${TARGET}_common${target_suffix})
  endif()
-  # 3. Reusing kernel compile
-  if (${selected_rows_srcs_len} GREATER 0)
-    if (WITH_GPU)
-      nv_library(${TARGET}_sr SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel})
-    elseif (WITH_ROCM)
-      hip_library(${TARGET}_sr SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel})
-    elseif (WITH_XPU_KP)
-      xpu_library(${TARGET}_sr SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel})
-    else()
-      cc_library(${TARGET}_sr SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel})
-    endif()
-    list(APPEND high_level_kernels ${TARGET}_sr)
-  endif()
-  # 4. Unify target compile
+  # 3. Unify target compile
  list(LENGTH base_device_kernels base_device_kernels_len)
  list(LENGTH device_independent_kernel device_independent_kernel_len)
-  list(LENGTH high_level_kernels high_level_kernels_len)
-  if (${base_device_kernels_len} GREATER 0 OR ${device_independent_kernel_len} GREATER 0 OR
-      ${high_level_kernels_len} GREATER 0)
+  if (${base_device_kernels_len} GREATER 0 OR ${device_independent_kernel_len} GREATER 0)
    if (WITH_GPU)
-      nv_library(${TARGET} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel} ${high_level_kernels})
+      nv_library(${TARGET}${target_suffix} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel})
    elseif (WITH_ROCM)
-      hip_library(${TARGET} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel} ${high_level_kernels})
+      hip_library(${TARGET}${target_suffix} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel})
    elseif (WITH_XPU_KP)
-      xpu_library(${TARGET} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel} ${high_level_kernels})
+      xpu_library(${TARGET}${target_suffix} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel})
    else()
-      cc_library(${TARGET} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel} ${high_level_kernels})
+      cc_library(${TARGET}${target_suffix} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel})
    endif()
  else()
    set(target_build_flag 0)
@@ -290,10 +293,10 @@ function(kernel_library TARGET)
  if (${target_build_flag} EQUAL 1)
    if (${common_srcs_len} GREATER 0 OR ${cpu_srcs_len} GREATER 0 OR
        ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0 OR
-        ${gpudnn_srcs_len} GREATER 0 OR ${selected_rows_srcs_len} GREATER 0)
+        ${gpudnn_srcs_len} GREATER 0)
      # append target into PHI_KERNELS property
      get_property(phi_kernels GLOBAL PROPERTY PHI_KERNELS)
-      set(phi_kernels ${phi_kernels} ${TARGET})
+      set(phi_kernels ${phi_kernels} ${TARGET}${target_suffix})
      set_property(GLOBAL PROPERTY PHI_KERNELS ${phi_kernels})
    endif()
@@ -318,9 +321,6 @@ function(kernel_library TARGET)
    if (${kps_srcs_len} GREATER 0)
      kernel_declare(${kps_srcs})
    endif()
-    if (${selected_rows_srcs_len} GREATER 0)
-      kernel_declare(${selected_rows_srcs})
-    endif()
  endif()
endfunction()
......
@@ -219,13 +219,13 @@ message GraphParameter {
  optional string gpups_graph_sample_class = 3
      [ default = "CompleteGraphSampler" ];
  optional string gpups_graph_sample_args = 4 [ default = "" ];
-  optional bool use_cache = 5 [ default = true ];
-  optional float cache_ratio = 6 [ default = 0.3 ];
+  optional bool use_cache = 5 [ default = false ];
+  optional int32 cache_size_limit = 6 [ default = 100000 ];
  optional int32 cache_ttl = 7 [ default = 5 ];
  optional GraphFeature graph_feature = 8;
  optional string table_name = 9 [ default = "" ];
  optional string table_type = 10 [ default = "" ];
-  optional int32 gpups_mode_shard_num = 11 [ default = 127 ];
+  optional int32 shard_num = 11 [ default = 127 ];
  optional int32 gpu_num = 12 [ default = 1 ];
}
......
@@ -138,7 +138,6 @@ int BasicBfsGraphSampler::run_graph_sampling() {
  int init_size = 0;
  //__sync_fetch_and_add
  std::function<int(int, int64_t)> bfs = [&, this](int i, int id) -> int {
-    VLOG(0) << "in bfs " << i << " " << id;
    if (this->status == GraphSamplerStatus::terminating) {
      int task_left = __sync_sub_and_fetch(&task_size, 1);
      if (task_left == 0) {
@@ -148,13 +147,13 @@ int BasicBfsGraphSampler::run_graph_sampling() {
    }
    size_t ind = i % this->graph_table->task_pool_size_;
    if (nodes_left[i] > 0) {
-      nodes_left[i]--;
      auto iter = sample_neighbors_map[ind].find(id);
      if (iter == sample_neighbors_map[ind].end()) {
-        sample_neighbors_map[ind][id] = std::vector<int64_t>();
-        iter = sample_neighbors_map[ind].find(id);
        Node *node = graph_table->shards[i]->find_node(id);
        if (node != NULL) {
+          nodes_left[i]--;
+          sample_neighbors_map[ind][id] = std::vector<int64_t>();
+          iter = sample_neighbors_map[ind].find(id);
          size_t edge_fetch_size =
              std::min((size_t) this->edge_num_for_each_node,
                       node->get_neighbor_size());
@@ -179,11 +178,14 @@ int BasicBfsGraphSampler::run_graph_sampling() {
  for (size_t i = 0; i < graph_table->shards.size(); ++i) {
    std::vector<Node *> &v = graph_table->shards[i]->get_bucket();
    if (v.size() > 0) {
-      init_size++;
-      __sync_add_and_fetch(&task_size, 1);
-      int64_t id = v[0]->get_id();
-      graph_table->_shards_task_pool[i % graph_table->task_pool_size_]
-          ->enqueue(bfs, i, id);
+      int search_size = std::min(init_search_size, (int)v.size());
+      for (int k = 0; k < search_size; k++) {
+        init_size++;
+        __sync_add_and_fetch(&task_size, 1);
+        int64_t id = v[k]->get_id();
+        graph_table->_shards_task_pool[i % graph_table->task_pool_size_]
+            ->enqueue(bfs, i, id);
+      }
    }  // if
  }
  if (init_size == 0) {
@@ -301,10 +303,11 @@ void BasicBfsGraphSampler::init(size_t gpu_num, GraphTable *graph_table,
                                std::vector<std::string> args) {
  this->gpu_num = gpu_num;
  this->graph_table = graph_table;
-  node_num_for_each_shard = args.size() > 0 ? std::stoi(args[0]) : 10;
-  edge_num_for_each_node = args.size() > 1 ? std::stoi(args[1]) : 10;
-  rounds = args.size() > 2 ? std::stoi(args[2]) : 1;
-  interval = args.size() > 3 ? std::stoi(args[3]) : 60;
+  init_search_size = args.size() > 0 ? std::stoi(args[0]) : 10;
+  node_num_for_each_shard = args.size() > 1 ? std::stoi(args[1]) : 10;
+  edge_num_for_each_node = args.size() > 2 ? std::stoi(args[2]) : 10;
+  rounds = args.size() > 3 ? std::stoi(args[3]) : 1;
+  interval = args.size() > 4 ? std::stoi(args[4]) : 60;
}
#endif
@@ -1092,11 +1095,6 @@ int32_t GraphTable::initialize(const GraphParameter &graph) {
#ifdef PADDLE_WITH_HETERPS
  if (graph.gpups_mode()) {
    gpups_mode = true;
-    if (shard_num == 0) {
-      shard_num = graph.gpups_mode_shard_num();
-      server_num = 1;
-      _shard_idx = 0;
-    }
    auto *sampler =
        CREATE_PSCORE_CLASS(GraphSampler, graph.gpups_graph_sample_class());
    auto slices =
@@ -1107,7 +1105,18 @@ int32_t GraphTable::initialize(const GraphParameter &graph) {
    graph_sampler.reset(sampler);
  }
#endif
+  if (shard_num == 0) {
+    server_num = 1;
+    _shard_idx = 0;
+    shard_num = graph.shard_num();
+  }
  task_pool_size_ = graph.task_pool_size();
+  use_cache = graph.use_cache();
+  if (use_cache) {
+    cache_size_limit = graph.cache_size_limit();
+    cache_ttl = graph.cache_ttl();
+    make_neighbor_sample_cache((size_t)cache_size_limit, (size_t)cache_ttl);
+  }
  _shards_task_pool.resize(task_pool_size_);
  for (size_t i = 0; i < _shards_task_pool.size(); ++i) {
    _shards_task_pool[i].reset(new ::ThreadPool(1));
......
@@ -547,6 +547,8 @@ class GraphTable : public SparseTable {
  std::unordered_set<int64_t> extra_nodes;
  std::unordered_map<int64_t, size_t> extra_nodes_to_thread_index;
  bool use_cache, use_duplicate_nodes;
+  int cache_size_limit;
+  int cache_ttl;
  mutable std::mutex mutex_;
  std::shared_ptr<pthread_rwlock_t> rw_lock;
#ifdef PADDLE_WITH_HETERPS
@@ -593,7 +595,7 @@ class BasicBfsGraphSampler : public GraphSampler {
  std::vector<std::vector<paddle::framework::GpuPsGraphNode>> sample_nodes;
  std::vector<std::vector<int64_t>> sample_neighbors;
  size_t gpu_num;
-  int node_num_for_each_shard, edge_num_for_each_node;
+  int init_search_size, node_num_for_each_shard, edge_num_for_each_node;
  int rounds, interval;
  std::vector<std::unordered_map<int64_t, std::vector<int64_t>>>
      sample_neighbors_map;
......
@@ -456,7 +456,7 @@ void RunBrpcPushSparse() {
    pull_status.wait();
    ASSERT_EQ(_vs[0].size(), vs1[0].size());
-    for (int j = 0; j < _vs[0].size(); j++) {
+    for (size_t j = 0; j < _vs[0].size(); j++) {
      ASSERT_EQ(_vs[0][j], vs1[0][j]);
    }
  }
......
@@ -86,7 +86,7 @@ void testGraphSample() {
#ifdef PADDLE_WITH_HETERPS
  ::paddle::distributed::GraphParameter table_proto;
  table_proto.set_gpups_mode(true);
-  table_proto.set_gpups_mode_shard_num(127);
+  table_proto.set_shard_num(127);
  table_proto.set_gpu_num(2);
  distributed::GraphTable graph_table, graph_table1;
@@ -113,7 +113,7 @@ void testGraphSample() {
  ::paddle::distributed::GraphParameter table_proto1;
  table_proto1.set_gpups_mode(true);
-  table_proto1.set_gpups_mode_shard_num(127);
+  table_proto1.set_shard_num(127);
  table_proto1.set_gpu_num(2);
  table_proto1.set_gpups_graph_sample_class("BasicBfsGraphSampler");
  table_proto1.set_gpups_graph_sample_args("5,5,1,1");
......
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import yaml
import re
import argparse
import os
########################
### Global Variables ###
########################
ops_to_fill_zero_for_empty_grads = set(list("split"))
# For API dispatch used at python-level
# { op_name : [arg_name, ...] }
core_ops_returns_info = {}
core_ops_args_info = {}
core_ops_args_type_info = {}
yaml_types_mapping = {
'int' : 'int', 'int32' : 'int32_t', 'int64' : 'int64_t', 'size_t' : 'size_t', \
'float' : 'float', 'double' : 'double', 'bool' : 'bool', \
'str' : 'std::string', \
'Place' : 'paddle::experimental::Place', 'DataLayout' : 'paddle::experimental::DataLayout', 'DataType' : 'paddle::experimental::DataType', \
'int64[]' : 'std::vector<int64_t>', 'int[]' : 'std::vector<int>',
'Tensor' : 'Tensor',
'Tensor[]' : 'std::vector<Tensor>',
'Tensor[Tensor[]]' : 'std::vector<std::vector<Tensor>>',
'Scalar' : 'paddle::experimental::Scalar',
'ScalarArray' : 'paddle::experimental::ScalarArray'
}
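# Example (illustrative, hypothetical api entry): an `args` string declaring
#   (Tensor x, Tensor[] params, ScalarArray shape)
# is mapped through yaml_types_mapping to the C++ argument types
#   'Tensor', 'std::vector<Tensor>', 'paddle::experimental::ScalarArray'.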
#############################
### File Reader Helpers ###
#############################
def ReadFwdFile(filepath):
f = open(filepath, 'r')
contents = yaml.load(f, Loader=yaml.FullLoader)
f.close()
return contents
def ReadBwdFile(filepath):
f = open(filepath, 'r')
contents = yaml.load(f, Loader=yaml.FullLoader)
ret = {}
for content in contents:
if 'backward_api' in content.keys():
api_name = content['backward_api']
else:
assert False
ret[api_name] = content
f.close()
return ret
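# Example (hypothetical backward yaml entry as consumed by ReadBwdFile):
#   - backward_api : matmul_grad
#     forward : matmul (Tensor x, Tensor y) -> Tensor(out)
#     args : (Tensor x, Tensor y, Tensor out_grad)
#     output : Tensor(x_grad), Tensor(y_grad)
# ReadBwdFile returns a dict keyed by the 'backward_api' name, e.g.
#   {'matmul_grad': {...the entry's contents...}}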
##################################
### Generic Helper Functions ###
##################################
def FindGradName(string):
return string + "_grad"
def FindForwardName(string):
if not string.endswith("_grad"):
return None
return string[:-5]
def IsPlainTensorType(string):
plain_tensor_types = ['Tensor&', 'Tensor', 'const Tensor&', 'const Tensor']
if string in plain_tensor_types:
return True
return False
def IsVectorTensorType(string):
vector_tensor_types = [
'std::vector<std::vector<Tensor>>', 'std::vector<Tensor>'
]
if string in vector_tensor_types:
return True
return False
def GetSavedName(string):
return string + "_"
def GetConstReference(string):
ret = string
if not string.startswith("const "):
ret = "const " + string
if not string.endswith("&"):
ret += "&"
return ret
def RemoveConstAndReference(string):
ret = string
if string.startswith("const "):
ret = ret[6:]
if string.endswith("&"):
ret = ret[:-1]
return ret
def GetGradNodeName(string):
return f"FinalGradNode{string}"
def GetDygraphForwardFunctionName(string):
return f"{string}_final_state_dygraph_function"
def GetIntermediateAPIFunctionName(string):
return string + "_intermediate"
def GetAutoGradMetaName(string):
return f"{string}_autograd_meta"
def GetAutoGradMetaVectorName(string):
return f"{string}_autograd_meta_vec"
def RemoveSpecialSymbolsInName(string):
# Remove any name after '@'
ret = string.split("@")[0]
return ret
def RecoverBaseNameOfInplaceFunction(function_name):
return function_name[:-1]
def GetInplacedFunctionName(function_name):
return function_name + "_"
def GetForwardFunctionName(string):
return f"{string}_final_state_dygraph_function"
######################
### Yaml Parsers ###
######################
def ParseYamlArgs(string):
# Example: const Tensor& x, const Tensor& y, bool transpose_x, bool transpose_y
# inputs_list = [ [arg_name, arg_type, orig_position], ...]
inputs_list = []
# attrs_list = [ [arg_name, arg_type, default_value, orig_position], ...]
attrs_list = []
args = [x.strip() for x in string.strip().split(",")]
atype = r'((const )?\S+) '
aname = r'(.*)'
pattern = f'{atype}{aname}'
for i in range(len(args)):
arg = args[i]
m = re.search(pattern, arg)
arg_type = m.group(1).strip()
arg_name = m.group(3).split("=")[0].strip()
default_value = m.group(3).split("=")[1].strip() if len(
m.group(3).split("=")) > 1 else None
assert arg_type in yaml_types_mapping.keys(
), f"The argument type {arg_type} in yaml config is not supported in yaml_types_mapping."
arg_type = yaml_types_mapping[arg_type]
arg_name = RemoveSpecialSymbolsInName(arg_name)
if "Tensor" in arg_type:
assert default_value is None
inputs_list.append([arg_name, arg_type, i])
else:
attrs_list.append([arg_name, arg_type, default_value, i])
return inputs_list, attrs_list
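# Example (illustrative call, argument string as it would appear in the yaml):
#   ParseYamlArgs("Tensor x, Tensor y, bool transpose_x = false")
# returns
#   inputs_list = [['x', 'Tensor', 0], ['y', 'Tensor', 1]]
#   attrs_list  = [['transpose_x', 'bool', 'false', 2]]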
def ParseYamlReturns(string):
# Example0: Tensor(out), Tensor(out1)
# Example1: Tensor, Tensor
# Example2: Tensor[](out), Tensor
# list = [ [ret_name, ret_type, orig_position], ...]
returns_list = []
returns = [x.strip() for x in string.strip().split(",")]
for i in range(len(returns)):
ret = returns[i]
ret_name = ""
if "(" in ret and ")" in ret:
# Remove trailing ')'
ret = ret[:-1]
ret_type = ret.split("(")[0].strip()
ret_name = ret.split("(")[1].strip()
else:
ret_type = ret.strip()
assert ret_type in yaml_types_mapping.keys(
), f"The return type {ret_type} in yaml config is not supported in yaml_types_mapping."
ret_type = yaml_types_mapping[ret_type]
assert "Tensor" in ret_type
ret_name = RemoveSpecialSymbolsInName(ret_name)
returns_list.append([ret_name, ret_type, i])
return returns_list
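# Example (illustrative): ParseYamlReturns("Tensor(out), Tensor[](states)") returns
#   [['out', 'Tensor', 0], ['states', 'std::vector<Tensor>', 1]]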
def ParseYamlForwardFromBackward(string):
# Example: matmul (const Tensor& x, const Tensor& y, bool transpose_x, bool transpose_y) -> Tensor(out)
fname = r'(.*?)'
wspace = r'\s*'
fargs = r'(.*?)'
frets = r'(.*)'
pattern = f'{fname}{wspace}\({wspace}{fargs}{wspace}\){wspace}->{wspace}{frets}'
m = re.search(pattern, string)
function_name = m.group(1)
function_args = m.group(2)
function_returns = m.group(3)
forward_inputs_list, forward_attrs_list = ParseYamlArgs(function_args)
forward_returns_list = ParseYamlReturns(function_returns)
return forward_inputs_list, forward_attrs_list, forward_returns_list
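# Example (illustrative): the 'forward' string of a backward yaml entry such as
#   "matmul (Tensor x, Tensor y, bool transpose_x, bool transpose_y) -> Tensor(out)"
# is split into
#   forward_inputs_list  = [['x', 'Tensor', 0], ['y', 'Tensor', 1]]
#   forward_attrs_list   = [['transpose_x', 'bool', None, 2], ['transpose_y', 'bool', None, 3]]
#   forward_returns_list = [['out', 'Tensor', 0]]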
def ParseYamlForward(args_str, returns_str):
# args Example: (const Tensor& x, const Tensor& y, bool transpose_x = false, bool transpose_y = false)
# returns Example: Tensor, Tensor
fargs = r'(.*?)'
wspace = r'\s*'
args_pattern = f'\({fargs}\)'
args_str = re.search(args_pattern, args_str).group(1)
inputs_list, attrs_list = ParseYamlArgs(args_str)
returns_list = ParseYamlReturns(returns_str)
return inputs_list, attrs_list, returns_list
def ParseYamlBackward(args_str, returns_str):
# args Example: (const Tensor& x, const Tensor& y, const Tensor& out_grad, bool transpose_x=false, bool transpose_y=false)
# returns Example: Tensor(x_grad), Tensor(y_grad)
fargs = r'(.*?)'
wspace = r'\s*'
args_pattern = f'\({fargs}\)'
args_str = re.search(args_pattern, args_str).group(1)
inputs_list, attrs_list = ParseYamlArgs(args_str)
returns_list = ParseYamlReturns(returns_str)
return inputs_list, attrs_list, returns_list
########################
### Generator Base ###
########################
class FunctionGeneratorBase:
def __init__(self, forward_api_contents, namespace):
self.forward_api_contents = forward_api_contents
self.namespace = namespace
self.forward_api_name = ""
self.orig_forward_inputs_list = [
] #[ [arg_name, arg_type, orig_position], ...]
self.orig_forward_attrs_list = [
] #[ [attr_name, attr_type, default_value, orig_position], ...]
self.orig_forward_returns_list = [
] #[ [ret_name, ret_type, orig_position], ...]
# Processed Forward Data
self.forward_inputs_position_map = {
} #{ "name" : [type, fwd_position] }
self.forward_outputs_position_map = {
} #{ "name" : [type, fwd_position] }
# Special Op Attributes
self.optional_inputs = [] #[name, ...]
self.no_need_buffers = [] #[name, ...]
self.intermediate_outputs = [] #[name, ...]
self.inplace_map = {} #{name : name, ...}
def ParseInplaceInfo(self):
forward_api_contents = self.forward_api_contents
if 'inplace' not in forward_api_contents.keys(): return
# inplace_map_str: "(x -> out0), (y -> out2)"
inplace_map_str = forward_api_contents['inplace']
for pair in inplace_map_str.split(","):
pair = pair.strip()
if pair.startswith("("):
pair = pair[1:]
if pair.endswith(")"):
pair = pair[:-1]
key = pair.split("->")[0].strip()
val = pair.split("->")[1].strip()
self.inplace_map[key] = val
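    # Example (illustrative): with forward_api_contents['inplace'] set to "(x -> out)",
    # ParseInplaceInfo leaves self.inplace_map == {'x': 'out'}.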
def ParseNoNeedBuffer(self):
forward_api_contents = self.forward_api_contents
if 'no_need_buffer' in forward_api_contents.keys():
no_need_buffer_str = forward_api_contents['no_need_buffer']
for name in no_need_buffer_str.split(","):
name = name.strip()
name = RemoveSpecialSymbolsInName(name)
self.no_need_buffers.append(name.strip())
def ParseDispensable(self):
forward_api_contents = self.forward_api_contents
if 'optional' in forward_api_contents.keys():
optional_inputs_str = forward_api_contents['optional']
for name in optional_inputs_str.split(","):
name = name.strip()
name = RemoveSpecialSymbolsInName(name)
self.optional_inputs.append(name)
def ParseIntermediate(self):
forward_api_contents = self.forward_api_contents
if 'intermediate' in forward_api_contents.keys():
intermediate_str = forward_api_contents['intermediate']
for name in intermediate_str.split(","):
name = name.strip()
name = RemoveSpecialSymbolsInName(name)
self.intermediate_outputs.append(name)
def CollectOriginalForwardInfo(self):
forward_api_contents = self.forward_api_contents
self.forward_api_name = forward_api_contents['api']
forward_args_str = forward_api_contents['args']
forward_returns_str = forward_api_contents['output']
assert 'api' in forward_api_contents.keys(
), "Unable to find \"api\" in forward_api_contents keys"
assert 'args' in forward_api_contents.keys(
), "Unable to find \"args\" in forward_api_contents keys"
assert 'output' in forward_api_contents.keys(
), "Unable to find \"output\" in forward_api_contents keys"
# Collect Original Forward Inputs/Outputs and then perform validation checks
self.orig_forward_inputs_list, self.orig_forward_attrs_list, self.orig_forward_returns_list = ParseYamlForward(
forward_args_str, forward_returns_str)
def DetermineForwardPositionMap(self, forward_inputs_list,
forward_returns_list):
for i in range(len(forward_inputs_list)):
forward_input = forward_inputs_list[i]
input_name = forward_input[0]
input_type = forward_input[1]
input_pos = forward_input[2]
self.forward_inputs_position_map[
input_name] = [input_type, input_pos]
for i in range(len(forward_returns_list)):
forward_return = forward_returns_list[i]
return_name = forward_return[0]
return_type = forward_return[1]
return_pos = forward_return[2]
self.forward_outputs_position_map[
return_name] = [return_type, return_pos]
print("Generated Forward Input Position Map: ",
self.forward_inputs_position_map)
print("Generated Forward Output Position Map: ",
self.forward_outputs_position_map)
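    # Example (illustrative): for a matmul-like api with inputs (x, y) and a single
    # output out, DetermineForwardPositionMap produces
    #   forward_inputs_position_map  == {'x': ['Tensor', 0], 'y': ['Tensor', 1]}
    #   forward_outputs_position_map == {'out': ['Tensor', 0]}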
class YamlGeneratorBase:
def __init__(self, api_yaml_path):
self.namespace = ""
self.api_yaml_path = api_yaml_path
self.forward_api_list = []
def ParseForwardYamlContents(self):
api_yaml_path = self.api_yaml_path
self.forward_api_list = ReadFwdFile(api_yaml_path)
def InferNameSpace(self):
api_yaml_path = self.api_yaml_path
if "sparse" in api_yaml_path:
self.namespace = "sparse::"
@@ -16,31 +16,25 @@ import yaml
import re
import argparse
import os
+from codegen_utils import core_ops_returns_info, core_ops_args_info, core_ops_args_type_info
+from codegen_utils import yaml_types_mapping
+from codegen_utils import ReadFwdFile, ReadBwdFile
+from codegen_utils import FindGradName, FindForwardName, GetSavedName, GetGradNodeName
+from codegen_utils import IsPlainTensorType, IsVectorTensorType
+from codegen_utils import GetConstReference, RemoveConstAndReference
+from codegen_utils import GetDygraphForwardFunctionName, GetIntermediateAPIFunctionName
+from codegen_utils import GetAutoGradMetaName, GetAutoGradMetaVectorName
+from codegen_utils import RemoveSpecialSymbolsInName, RecoverBaseNameOfInplaceFunction
+from codegen_utils import GetInplacedFunctionName
+from codegen_utils import ParseYamlArgs, ParseYamlReturns, ParseYamlForwardFromBackward
+from codegen_utils import ParseYamlForward, ParseYamlBackward
+from codegen_utils import FunctionGeneratorBase, YamlGeneratorBase
+from codegen_utils import ops_to_fill_zero_for_empty_grads
-ops_to_fill_zero_for_empty_grads = set(list("split"))
-# For API dispatch used at python-level
-# { op_name : [arg_name, ...] }
-core_ops_returns_info = {}
-core_ops_args_info = {}
-core_ops_args_type_info = {}
-namespace = ""
-yaml_types_mapping = {
-    'int' : 'int', 'int32' : 'int32_t', 'int64' : 'int64_t', 'size_t' : 'size_t', \
-    'float' : 'float', 'double' : 'double', 'bool' : 'bool', \
-    'str' : 'std::string', \
-    'Place' : 'paddle::experimental::Place', 'DataLayout' : 'paddle::experimental::DataLayout', 'DataType' : 'paddle::experimental::DataType', \
-    'int64[]' : 'std::vector<int64_t>', 'int[]' : 'std::vector<int>',
-    'Tensor' : 'Tensor',
-    'Tensor[]' : 'std::vector<Tensor>',
-    'Tensor[Tensor[]]' : 'std::vector<std::vector<Tensor>>',
-    'Scalar' : 'paddle::experimental::Scalar',
-    'ScalarArray' : 'paddle::experimental::ScalarArray'
-}
+###########
+## Utils ##
+###########
def ParseArguments():
    parser = argparse.ArgumentParser(
        description='Eager Code Generator Args Parser')
@@ -55,845 +49,129 @@ def ParseArguments():
    return args
-#################
-### Helpers ###
-#################
+########################
+## Code Gen Templates ##
+########################
def RecoverBaseNameOfInplaceFunction(function_name): SET_PLAIN_TENSOR_WRAPPER_TEMPLATE = \
return function_name[:-1] """
def GetInplacedFunctionName(function_name):
return function_name + "_"
def FindGradName(string):
return string + "_grad"
def FindForwardName(string):
if not string.endswith("_grad"):
return None
return string[:-5]
def IsPlainTensorType(string):
plain_tensor_types = ['Tensor&', 'Tensor', 'const Tensor&', 'const Tensor']
if string in plain_tensor_types:
return True
return False
def IsVectorTensorType(string):
vector_tensor_types = [
'std::vector<std::vector<Tensor>>', 'std::vector<Tensor>'
]
if string in vector_tensor_types:
return True
return False
def GetSavedName(string):
return string + "_"
def GetConstReference(string):
ret = string
if not string.startswith("const "):
ret = "const " + string
if not string.endswith("&"):
ret += "&"
return ret
def RemoveConstAndReference(string):
ret = string
if string.startswith("const "):
ret = ret[6:]
if string.endswith("&"):
ret = ret[:-1]
return ret
def GetGradNodeName(string):
return f"FinalGradNode{string}"
def GetForwardFunctionName(string):
return f"{string}_final_state_dygraph_function"
def GetAutoGradMetaName(string):
return f"{string}_autograd_meta"
def GetAutoGradMetaVectorName(string):
return f"{string}_autograd_meta_vec"
######################
### File Readers ###
######################
def ReadFwdFile(filepath):
f = open(filepath, 'r')
contents = yaml.load(f, Loader=yaml.FullLoader)
f.close()
return contents
def ReadBwdFile(filepath):
f = open(filepath, 'r')
contents = yaml.load(f, Loader=yaml.FullLoader)
ret = {}
for content in contents:
if 'backward_api' in content.keys():
api_name = content['backward_api']
else:
assert False
ret[api_name] = content
f.close()
return ret
######################
### Yaml Parsers ###
######################
def ParseInplaceInfo(string):
# string: "(x -> out0), (y -> out2)"
inplace_map = {}
for pair in string.split(","):
pair = pair.strip()
if pair.startswith("("):
pair = pair[1:]
if pair.endswith(")"):
pair = pair[:-1]
key = pair.split("->")[0].strip()
val = pair.split("->")[1].strip()
inplace_map[key] = val
return inplace_map
def RemoveSpecialSymbolsInName(string):
# Remove any name after '@'
ret = string.split("@")[0]
return ret
def IntermediateValidationCheck(intermediate_outputs, forward_returns_list):
# intermediate_outputs : [name0, name1, ...]
# forward_returns_list : [[ret_name, type, orig_pos], ...]
"""
Check whether intermediate_outputs are positioned
at the very end of forward_returns_list
"""
intermediate_positions = range(
len(forward_returns_list) - len(intermediate_outputs),
len(forward_returns_list))
for ret_name, _, pos in forward_returns_list:
if ret_name in intermediate_outputs:
assert pos in intermediate_positions
def ParseDispensable(string):
# string: "X, Y"
string = RemoveSpecialSymbolsInName(string)
return [v.strip() for v in string.split(",")]
def ParseIntermediate(string):
string = RemoveSpecialSymbolsInName(string)
return [v.strip() for v in string.split(",")]
def ParseNoNeedBuffer(string):
# string: "x, y"
string = RemoveSpecialSymbolsInName(string)
no_need_buffer_set = set()
for name in string.split(","):
no_need_buffer_set.add(name.strip())
return no_need_buffer_set
def ParseYamlArgs(string):
# Example: const Tensor& x, const Tensor& y, bool transpose_x, bool transpose_y
# inputs_list = [ [arg_name, arg_type, orig_position], ...]
inputs_list = []
# attrs_list = [ [arg_name, arg_type, default_value, orig_position], ...]
attrs_list = []
args = [x.strip() for x in string.strip().split(",")]
atype = r'((const )?\S+) '
aname = r'(.*)'
pattern = f'{atype}{aname}'
for i in range(len(args)):
arg = args[i]
m = re.search(pattern, arg)
arg_type = m.group(1).strip()
arg_name = m.group(3).split("=")[0].strip()
default_value = m.group(3).split("=")[1].strip() if len(
m.group(3).split("=")) > 1 else None
assert arg_type in yaml_types_mapping.keys(
), f"The argument type {arg_type} in yaml config is not supported in yaml_types_mapping."
arg_type = yaml_types_mapping[arg_type]
arg_name = RemoveSpecialSymbolsInName(arg_name)
if "Tensor" in arg_type:
assert default_value is None
inputs_list.append([arg_name, arg_type, i])
else:
attrs_list.append([arg_name, arg_type, default_value, i])
return inputs_list, attrs_list
def ParseYamlReturns(string):
# Example0: Tensor(out), Tensor(out1)
# Example1: Tensor, Tensor
# Example2: Tensor[](out), Tensor
# list = [ [ret_name, ret_type, orig_position], ...]
returns_list = []
returns = [x.strip() for x in string.strip().split(",")]
for i in range(len(returns)):
ret = returns[i]
ret_name = ""
if "(" in ret and ")" in ret:
# Remove trailing ')'
ret = ret[:-1]
ret_type = ret.split("(")[0].strip()
ret_name = ret.split("(")[1].strip()
else:
ret_type = ret.strip()
assert ret_type in yaml_types_mapping.keys(
), f"The return type {ret_type} in yaml config is not supported in yaml_types_mapping."
ret_type = yaml_types_mapping[ret_type]
assert "Tensor" in ret_type
ret_name = RemoveSpecialSymbolsInName(ret_name)
returns_list.append([ret_name, ret_type, i])
return returns_list
def ParseYamlForwardFromBackward(string):
# Example: matmul (const Tensor& x, const Tensor& y, bool transpose_x, bool transpose_y) -> Tensor(out)
fname = r'(.*?)'
wspace = r'\s*'
fargs = r'(.*?)'
frets = r'(.*)'
pattern = f'{fname}{wspace}\({wspace}{fargs}{wspace}\){wspace}->{wspace}{frets}'
m = re.search(pattern, string)
function_name = m.group(1)
function_args = m.group(2)
function_returns = m.group(3)
forward_inputs_list, forward_attrs_list = ParseYamlArgs(function_args)
forward_returns_list = ParseYamlReturns(function_returns)
return forward_inputs_list, forward_attrs_list, forward_returns_list
def ParseYamlForward(args_str, returns_str):
# args Example: (const Tensor& x, const Tensor& y, bool transpose_x = false, bool transpose_y = false)
# returns Example: Tensor, Tensor
fargs = r'(.*?)'
wspace = r'\s*'
args_pattern = f'\({fargs}\)'
args_str = re.search(args_pattern, args_str).group(1)
inputs_list, attrs_list = ParseYamlArgs(args_str)
returns_list = ParseYamlReturns(returns_str)
return inputs_list, attrs_list, returns_list
def ParseYamlBackward(args_str, returns_str):
# args Example: (const Tensor& x, const Tensor& y, const Tensor& out_grad, bool transpose_x=false, bool transpose_y=false)
# returns Example: Tensor(x_grad), Tensor(y_grad)
fargs = r'(.*?)'
wspace = r'\s*'
args_pattern = f'\({fargs}\)'
args_str = re.search(args_pattern, args_str).group(1)
inputs_list, attrs_list = ParseYamlArgs(args_str)
returns_list = ParseYamlReturns(returns_str)
return inputs_list, attrs_list, returns_list
#######################
### Preprocessing ###
#######################
def ForwardsValidationCheck(forward_inputs_list, forward_attrs_list,
forward_returns_list, orig_forward_inputs_list,
orig_forward_attrs_list, orig_forward_returns_list):
for i in range(len(forward_inputs_list)):
forward_input_name = forward_inputs_list[i][0]
forward_input_type = forward_inputs_list[i][1]
forward_input_pos = forward_inputs_list[i][2]
orig_input_name = orig_forward_inputs_list[i][0]
orig_input_type = orig_forward_inputs_list[i][1]
orig_input_pos = orig_forward_inputs_list[i][2]
assert forward_input_type == orig_input_type
assert forward_input_pos == orig_input_pos
for i in range(len(forward_attrs_list)):
orig_attr_name = orig_forward_attrs_list[i][0]
orig_attr_type = orig_forward_attrs_list[i][1]
orig_attr_default = orig_forward_attrs_list[i][2]
orig_attr_pos = orig_forward_attrs_list[i][3]
forward_attr_name = forward_attrs_list[i][0]
forward_attr_type = forward_attrs_list[i][1]
forward_attr_default = forward_attrs_list[i][2]
forward_attr_pos = forward_attrs_list[i][3]
assert orig_attr_type == forward_attr_type
assert orig_attr_default == forward_attr_default
assert orig_attr_pos == forward_attr_pos
for i in range(len(forward_returns_list)):
orig_return_type = orig_forward_returns_list[i][1]
orig_return_pos = orig_forward_returns_list[i][2]
forward_return_type = forward_returns_list[i][1]
forward_return_pos = forward_returns_list[i][2]
assert orig_return_type == forward_return_type
assert orig_return_pos == forward_return_pos
# Check Order: Inputs, Attributes
max_input_position = -1
for _, _, pos in forward_inputs_list:
max_input_position = max(max_input_position, pos)
max_attr_position = -1
for _, _, _, pos in forward_attrs_list:
assert pos > max_input_position
max_attr_position = max(max_attr_position, pos)
def BackwardValidationCheck(backward_fwd_input_map, backward_grad_input_map,
backward_attrs_list):
# Check Order: TensorWrappers, GradTensors, Attributes
max_fwd_input_position = -1
for _, (_, _, pos) in backward_fwd_input_map.items():
max_fwd_input_position = max(max_fwd_input_position, pos)
max_grad_tensor_position = -1
for _, (_, _, pos) in backward_grad_input_map.items():
assert pos > max_fwd_input_position
max_grad_tensor_position = max(max_grad_tensor_position, pos)
max_attr_position = -1
for _, _, _, pos in backward_attrs_list:
assert pos > max_grad_tensor_position
max_attr_position = max(max_attr_position, pos)
def DetermineForwardPositionMap(forward_inputs_list, forward_returns_list):
forward_inputs_position_map = {}
forward_outputs_position_map = {}
for i in range(len(forward_inputs_list)):
forward_input = forward_inputs_list[i]
input_name = forward_input[0]
input_type = forward_input[1]
input_pos = forward_input[2]
forward_inputs_position_map[input_name] = [input_type, input_pos]
for i in range(len(forward_returns_list)):
forward_return = forward_returns_list[i]
return_name = forward_return[0]
return_type = forward_return[1]
return_pos = forward_return[2]
forward_outputs_position_map[return_name] = [return_type, return_pos]
return forward_inputs_position_map, forward_outputs_position_map
def SlotNameMatching(backward_inputs_list, backward_returns_list,
forward_inputs_position_map, forward_outputs_position_map):
backward_fwd_input_map = {}
backward_grad_input_map = {}
backward_grad_output_map = {}
for backward_input in backward_inputs_list:
backward_input_name = backward_input[0]
backward_input_type = backward_input[1]
backward_input_pos = backward_input[2]
backward_fwd_name = FindForwardName(backward_input_name)
if backward_fwd_name:
# Grad Input
assert backward_fwd_name in forward_outputs_position_map.keys()
matched_forward_output_type = forward_outputs_position_map[
backward_fwd_name][0]
matched_forward_output_pos = forward_outputs_position_map[
backward_fwd_name][1]
backward_grad_input_map[backward_input_name] = [
backward_input_type, matched_forward_output_pos,
backward_input_pos
]
else:
# TensorWrapper Input
if backward_input_name in forward_inputs_position_map.keys():
tensor_wrapper_type = forward_inputs_position_map[
backward_input_name][0]
backward_fwd_input_map[backward_input_name] = [
backward_input_type, True, backward_input_pos
]
elif backward_input_name in forward_outputs_position_map.keys():
tensor_wrapper_type = forward_outputs_position_map[
backward_input_name][0]
backward_fwd_input_map[backward_input_name] = [
backward_input_type, False, backward_input_pos
]
else:
assert False, backward_input_name
for backward_output in backward_returns_list:
backward_output_name = backward_output[0]
backward_output_type = backward_output[1]
backward_output_pos = backward_output[2]
backward_fwd_name = FindForwardName(backward_output_name)
assert backward_fwd_name is not None
assert backward_fwd_name in forward_inputs_position_map.keys(
), backward_fwd_name
matched_forward_input_type = forward_inputs_position_map[
backward_fwd_name][0]
matched_forward_input_pos = forward_inputs_position_map[
backward_fwd_name][1]
backward_grad_output_map[backward_output_name] = [
backward_output_type, matched_forward_input_pos, backward_output_pos
]
return backward_fwd_input_map, backward_grad_input_map, backward_grad_output_map
def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map,
backward_attrs_list, no_need_buffer_set):
# Inputs:
# fwd_api_name = ""
# backward_fwd_input_map = { "name" : [type, is_fwd_input, orig_position] ...}
# backward_attrs_list = [ [attr_name, attr_type, default_value, orig_position], ...]
# Determine Node Name
forward_op_name = fwd_api_name
# SetTensorWrapper Methods & TensorWrapper Members
set_tensor_wrapper_methods_str = ""
tensor_wrapper_members_str = ""
clear_tensor_wrapper_str = ""
for tname, (ttype, is_fwd_input, _) in backward_fwd_input_map.items():
if tname in no_need_buffer_set:
no_need_buffer = "true"
else:
no_need_buffer = "false"
tensor_wrapper_name = GetSavedName(tname)
if IsPlainTensorType(ttype):
SET_PLAIN_TENSOR_WRAPPER_TEMPLATE = """
void SetTensorWrapper{}(const paddle::experimental::Tensor& {}, bool full_reserved) {{ void SetTensorWrapper{}(const paddle::experimental::Tensor& {}, bool full_reserved) {{
{} = egr::TensorWrapper({}, full_reserved, {}); {} = egr::TensorWrapper({}, full_reserved, {});
}} }}
""" """
set_tensor_wrapper_methods_str += SET_PLAIN_TENSOR_WRAPPER_TEMPLATE.format(
tname, tname, tensor_wrapper_name, tname, no_need_buffer)
PLAIN_TENSOR_MEMBER_TEMPLATE = """ PLAIN_TENSOR_MEMBER_TEMPLATE = \
egr::TensorWrapper {}; """
egr::TensorWrapper {};
""" """
tensor_wrapper_members_str += PLAIN_TENSOR_MEMBER_TEMPLATE.format(
tensor_wrapper_name)
CLEAR_TENSOR_WRAPPERS_TEMPLATE = """ CLEAR_TENSOR_WRAPPER_TEMPLATE = \
{}.clear(); """
{}.clear();
""" """
clear_tensor_wrapper_str += CLEAR_TENSOR_WRAPPERS_TEMPLATE.format(
tensor_wrapper_name)
else: SET_VECTOR_TENSOR_WRAPPER_TEMPLATE = \
assert IsVectorTensorType(ttype) """
SET_VECTOR_TENSOR_WRAPPER_TEMPLATE = """ void SetTensorWrapper{}(const std::vector<paddle::experimental::Tensor>& {}, bool full_reserved) {{
void SetTensorWrapper{}(const std::vector<paddle::experimental::Tensor>& {}, bool full_reserved) {{ for(const auto& eager_tensor : {}) {{
for(const auto& eager_tensor : {}) {{ {}.emplace_back( egr::TensorWrapper(eager_tensor, full_reserved, {}) );
{}.emplace_back( egr::TensorWrapper(eager_tensor, full_reserved, {}) ); }};
}}; }}
}}
""" """
set_tensor_wrapper_methods_str += SET_VECTOR_TENSOR_WRAPPER_TEMPLATE.format(
tname, tname, tname, tensor_wrapper_name, no_need_buffer)
VECTOR_TENSOR_MEMBER_TEMPLATE = """ VECTOR_TENSOR_MEMBER_TEMPLATE = \
std::vector<egr::TensorWrapper> {}; """
std::vector<egr::TensorWrapper> {};
""" """
tensor_wrapper_members_str += VECTOR_TENSOR_MEMBER_TEMPLATE.format(
tensor_wrapper_name)
CLEAR_TENSOR_WRAPPERS_TEMPLATE = """ CLEAR_VECTOR_TENSOR_WRAPPERS_TEMPLATE = \
for (auto tw: {}) {
tw.clear();
};
""" """
clear_tensor_wrapper_str += CLEAR_TENSOR_WRAPPERS_TEMPLATE.format( for (auto tw: {}) {
tensor_wrapper_name) tw.clear();
};
# End: SetTensorWrapper Methods & TensorWrapper Members
# SetAttributes & Attribute Members
set_attribute_methods_str = ""
attribute_members_str = ""
for aname, atype, default_val, _ in backward_attrs_list:
saved_attr_name = GetSavedName(aname)
SET_ATTR_METHOD_TEMPLATE = """
void SetAttribute{}({} {}) {{
{} = {};
}}
""" """
set_attribute_methods_str += SET_ATTR_METHOD_TEMPLATE.format(
aname, GetConstReference(atype), aname, saved_attr_name, aname)
if default_val: SET_ATTR_METHOD_TEMPLATE = \
ATTRIBUTE_MEMBER_TEMPLATE = """ """
void SetAttribute{}({} {}) {{
{} = {};
}}
"""
ATTRIBUTE_MEMBER_WITH_DEFAULT_TEMPLATE = \
"""
{} {} = {}; {} {} = {};
""" """
attribute_members_str += ATTRIBUTE_MEMBER_TEMPLATE.format(
RemoveConstAndReference(atype), saved_attr_name, default_val) ATTRIBUTE_MEMBER_TEMPLATE = \
else: """
ATTRIBUTE_MEMBER_TEMPLATE = """
{} {}; {} {};
"""
attribute_members_str += ATTRIBUTE_MEMBER_TEMPLATE.format(
RemoveConstAndReference(atype), saved_attr_name)
# End: SetAttributes & Attribute Members
grad_node_name = GetGradNodeName(fwd_api_name)
NODE_DECLARATION_TEMPLATE = """
class {} : public egr::GradNodeBase {{
public:
{}() : egr::GradNodeBase() {{}}
{}(size_t bwd_in_slot_num, size_t bwd_out_slot_num) :
egr::GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) {{}}
~{}() override = default;
virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()(
std::vector<std::vector<paddle::experimental::Tensor>>& grads, bool create_graph = false) override;
std::string name() override {{ return \" {} \"; }}
void ClearTensorWrappers() override {{
{}
is_tensor_wrappers_cleared = true;
}}
// SetTensorWrapperX, SetTensorWrapperY, ...
{}
// SetAttributes
{}
bool IsTensorWrappersCleared() override {{
return is_tensor_wrappers_cleared;
}}
private:
// TensorWrappers
{}
bool is_tensor_wrappers_cleared = false;
// Attributes
{}
}};
""" """
node_declaration_str = NODE_DECLARATION_TEMPLATE.format(
grad_node_name, grad_node_name, grad_node_name, grad_node_name,
grad_node_name, clear_tensor_wrapper_str,
set_tensor_wrapper_methods_str, set_attribute_methods_str,
tensor_wrapper_members_str, attribute_members_str)
return node_declaration_str
def GenerateNodeDefinition(fwd_api_name, bwd_api_name, backward_fwd_input_map,
backward_grad_input_map, backward_grad_output_map,
backward_attrs_list):
# fwd_api_name = ""
# backward_fwd_input_map = { "name" : [type, is_fwd_input, orig_position] ...}
# backward_grad_input_map = { "name" : [type, fwd_position, orig_position] ...}
# backward_grad_output_map = { "name" : [type, fwd_position, orig_position] ...}
# backward_attrs_list = [ [attr_name, attr_type, default_value, orig_position], ...]
# Construct grad_api function args
# Order: TensorWrappers, GradTensors, Attributes
grad_api_args_len = len(backward_fwd_input_map.keys()) + len(
backward_grad_input_map.keys()) + len(backward_attrs_list)
grad_api_args = ["" for i in range(grad_api_args_len)]
for name, (_, is_fwd_input,
grad_api_position), in backward_fwd_input_map.items():
tensor_wrapper_name = GetSavedName(name)
grad_api_args[
grad_api_position] = f"egr::EagerUtils::RecoverTensorWrapper(&this->{tensor_wrapper_name}, nullptr)"
for _, (ttype, fwd_position,
grad_api_position) in backward_grad_input_map.items():
if IsPlainTensorType(ttype):
grad_api_args[
grad_api_position] = f"hooked_grads[{fwd_position}][0]"
else:
assert IsVectorTensorType(ttype)
grad_api_args[grad_api_position] = f"hooked_grads[{fwd_position}]"
for name, _, _, grad_api_position in backward_attrs_list:
saved_attribute_name = GetSavedName(name)
grad_api_args[grad_api_position] = f"this->{saved_attribute_name}"
grad_api_args_str = ", ".join(grad_api_args)
# Construct grad_api returns
num_bwd_outputs = len(backward_grad_output_map.keys())
returns_str = f"std::vector<std::vector<paddle::experimental::Tensor>> returns({num_bwd_outputs});\n"
for _, (ttype, fwd_position,
grad_api_position) in backward_grad_output_map.items():
# Infer Grad API Return Type
if num_bwd_outputs == 1:
# Single tensor output, return as is
if IsPlainTensorType(ttype):
returns_str += "returns[0] = { grad_api_returns };\n"
else:
assert IsVectorTensorType(ttype)
returns_str += "returns[0] = grad_api_returns;\n"
else:
# Rearrange output order accordingly
returns_str += f"returns[{fwd_position}] = grad_api_returns[{grad_api_position}];\n"
returns_str += f"if(NeedComplexToRealConversion()) HandleComplexGradToRealGrad(&returns);\n"
returns_str += f"return returns;\n"
grad_node_name = GetGradNodeName(fwd_api_name)
fill_zero_str = ""
if fwd_api_name in ops_to_fill_zero_for_empty_grads:
fill_zero_str = "egr::EagerUtils::FillZeroForEmptyGradInputs(&grads, this->InputMeta());\n"
if len(namespace) > 0:
grad_api_namespace = f"paddle::experimental::{namespace}"
else:
grad_api_namespace = f"paddle::experimental"
FUNCTION_TEMPLATE = """
std::vector<std::vector<paddle::experimental::Tensor>> {}::operator()(std::vector<std::vector<paddle::experimental::Tensor>>& grads, bool create_graph) {{
{}
auto hooked_grads = ApplyGradientHooks(grads);
// Call grad_api function
VLOG(3) << \"Final State Running: \" << \"{}\";
auto grad_api_returns = {}::{}({});
{}
}}
"""
node_definition_str = FUNCTION_TEMPLATE.format(
grad_node_name, fill_zero_str, grad_node_name, grad_api_namespace,
bwd_api_name, grad_api_args_str, returns_str)
return node_definition_str
NODE_DECLARATION_TEMPLATE = \
"""
class {} : public egr::GradNodeBase {{
public:
{}() : egr::GradNodeBase() {{}}
{}(size_t bwd_in_slot_num, size_t bwd_out_slot_num) :
egr::GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) {{}}
~{}() override = default;
virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()(
std::vector<std::vector<paddle::experimental::Tensor>>& grads, bool create_graph = false) override;
std::string name() override {{ return \" {} \"; }}
void ClearTensorWrappers() override {{
{}
is_tensor_wrappers_cleared = true;
}}
// SetTensorWrapperX, SetTensorWrapperY, ...
{}
// SetAttributes
{}
bool IsTensorWrappersCleared() override {{
return is_tensor_wrappers_cleared;
}}
private:
// TensorWrappers
{}
bool is_tensor_wrappers_cleared = false;
// Attributes
{}
}};
"""
def GenerateNodeCreationCodes(
fwd_api_name, bwd_api_name, forward_inputs_position_map,
forward_outputs_position_map, forward_attrs_list, forward_call_str,
backward_fwd_input_map, backward_grad_input_map,
backward_grad_output_map, backward_attrs_list, optional_inputs,
inplace_map):
# fwd_api_name = ""
# forward_inputs_position_map = { "name" : [type, fwd_position] }
# forward_outputs_position_map = { "name" : [type, fwd_position] }
# forward_attrs_list = [ [attr_name, attr_type, default_value, orig_position], ...]
# backward_fwd_input_map = { "name" : [type, is_fwd_input, orig_position] ...}
# backward_grad_input_map = { "name" : [type, fwd_position, orig_position] ...}
# backward_grad_output_map = { "name" : [type, fwd_position, orig_position] ...}
# backward_attrs_list = [ [attr_name, attr_type, default_value, orig_position], ...]
# Get Input AutoGradMeta
inputs_autograd_meta_list = []
compute_require_grad_args_list = ["trace_backward"]
for name, (ttype, pos) in forward_inputs_position_map.items():
input_autograd_meta_name = GetAutoGradMetaName(name)
if IsPlainTensorType(ttype):
input_autograd_meta = f" egr::AutogradMeta* {input_autograd_meta_name} = egr::EagerUtils::nullable_autograd_meta({name});"
else:
assert IsVectorTensorType(ttype)
input_autograd_meta_vec_name = GetAutoGradMetaVectorName(name)
input_autograd_meta = f" std::vector<egr::AutogradMeta*> {input_autograd_meta_vec_name} = egr::EagerUtils::nullable_autograd_meta({name});\n"
input_autograd_meta += f" std::vector<egr::AutogradMeta*>* {input_autograd_meta_name} = &{input_autograd_meta_vec_name};"
inputs_autograd_meta_list.append(input_autograd_meta)
compute_require_grad_args_list.append(input_autograd_meta_name)
inputs_autograd_meta_str = "\n".join(inputs_autograd_meta_list)
compute_require_grad_args_str = ",".join(compute_require_grad_args_list)
# Get Output AutoGradMeta
outputs_autograd_meta_list = []
pass_stop_gradient_args_list = ["false"]
num_fwd_outputs = len(forward_outputs_position_map.keys())
for name, (rtype, pos) in forward_outputs_position_map.items():
output_autograd_meta_name = GetAutoGradMetaName(name)
output_autograd_meta_vec_name = GetAutoGradMetaVectorName(name)
if num_fwd_outputs == 1:
if IsPlainTensorType(rtype):
output_autograd_meta = f" egr::AutogradMeta* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(&api_result);"
else:
assert IsVectorTensorType(rtype)
output_autograd_meta = f" std::vector<egr::AutogradMeta*> {output_autograd_meta_vec_name} = egr::EagerUtils::autograd_meta(&api_result);\n"
output_autograd_meta += f" std::vector<egr::AutogradMeta*>* {output_autograd_meta_name} = &{output_autograd_meta_vec_name};"
else:
# Tuple api_result
if IsPlainTensorType(rtype):
output_autograd_meta = f" egr::AutogradMeta* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(&std::get<{pos}>(api_result));"
else:
assert IsVectorTensorType(rtype)
output_autograd_meta = f" std::vector<egr::AutogradMeta*> {output_autograd_meta_vec_name} = egr::EagerUtils::autograd_meta(&std::get<{pos}>(api_result));\n"
output_autograd_meta += f" std::vector<egr::AutogradMeta*>* {output_autograd_meta_name} = &{output_autograd_meta_vec_name};"
outputs_autograd_meta_list.append(output_autograd_meta)
pass_stop_gradient_args_list.append(output_autograd_meta_name)
# ComputeRequireGrad & PassStopGradient
outputs_autograd_meta_str = "\n".join(outputs_autograd_meta_list)
pass_stop_gradient_args_str = ",".join(pass_stop_gradient_args_list)
# Check Inplace
check_inplace_str = ""
bump_inplace_version_str = ""
for inplace_name in inplace_map.keys():
inplace_autograd_meta_name = GetAutoGradMetaName(inplace_name)
check_inplace_str += f"""
// Check Inplace
egr::EagerUtils::CheckInplace({inplace_name}, {inplace_autograd_meta_name}, require_any_grad);\n
""" """
bump_inplace_version_str += f""" FUNCTION_TEMPLATE = \
// Bump Inplace Version """
{inplace_name}.bump_inplace_version(); std::vector<std::vector<paddle::experimental::Tensor>> {}::operator()(std::vector<std::vector<paddle::experimental::Tensor>>& grads, bool create_graph) {{
VLOG(3) << \"Tensor(\" << {inplace_name}.name() << \") uses Inplace Strategy.\";\n {}
auto hooked_grads = ApplyGradientHooks(grads);
// Call grad_api function
VLOG(3) << \"Final State Running: \" << \"{}\";
auto grad_api_returns = {}{}({});
{}
}}
""" """
# Node Construction FORWARD_FUNCTION_TEMPLATE = \
num_bwd_inputs = len(backward_grad_input_map.keys()) """
num_bwd_outputs = len(backward_grad_output_map.keys()) {} {}({}) {{
grad_node_name = GetGradNodeName( {}
RecoverBaseNameOfInplaceFunction(
fwd_api_name)) if inplace_map else GetGradNodeName(fwd_api_name) {}
node_construction_str = f" auto grad_node = std::make_shared<{grad_node_name}>({num_bwd_inputs}, {num_bwd_outputs});"
# SetAttributes
set_attributes_list = []
forward_attrs_name_set = set()
for name, _, _, _ in forward_attrs_list:
forward_attrs_name_set.add(name)
for name, _, default_val_attr, _ in backward_attrs_list:
if name in forward_attrs_name_set:
set_attributes = f" grad_node->SetAttribute{name}({name});"
else:
set_attributes = f" grad_node->SetAttribute{name}({default_val_attr});"
set_attributes_list.append(set_attributes)
set_attributes_str = "\n".join(set_attributes_list)
# SetTensorWrappers
set_tensor_wrappers_list = []
for name, (atype, is_fwd_input, pos) in backward_fwd_input_map.items():
is_optional = (name in optional_inputs)
if is_fwd_input:
if is_optional:
set_tensor_wrappers = f" if({name}.is_initialized()) grad_node->SetTensorWrapper{name}({name}, true);"
else:
set_tensor_wrappers = f" grad_node->SetTensorWrapper{name}({name}, true);"
else:
if num_fwd_outputs > 1:
# Aligned with forward output position
assert name in forward_outputs_position_map.keys()
fwd_output_pos = forward_outputs_position_map[name][1]
tw_name = f"std::get<{fwd_output_pos}>(api_result)"
else:
tw_name = f"api_result"
if is_optional:
set_tensor_wrappers = f" if({tw_name}.is_initialized()) grad_node->SetTensorWrapper{name}({tw_name}, false);"
else:
set_tensor_wrappers = f" grad_node->SetTensorWrapper{name}({tw_name}, false);"
set_tensor_wrappers_list.append(set_tensor_wrappers)
set_tensor_wrappers_str = "\n".join(set_tensor_wrappers_list)
# SetGradOutMeta & SetEdges
set_grad_out_meta_list = []
set_edges_list = []
for name, (_, pos) in forward_inputs_position_map.items():
input_autograd_meta_name = GetAutoGradMetaName(name)
set_grad_out_meta = f" grad_node->SetGradOutMeta({name}, {pos});"
set_edges = f" grad_node->AddEdges({input_autograd_meta_name}, {pos});"
set_grad_out_meta_list.append(set_grad_out_meta)
set_edges_list.append(set_edges)
set_grad_out_meta_str = "\n".join(set_grad_out_meta_list)
set_edges_str = "\n".join(set_edges_list)
# SetOutRank & SetHistory & SetGradInMeta
set_out_rank_list = []
set_history_list = []
set_grad_in_meta_list = []
set_retain_grad_list = []
num_outputs = len(forward_outputs_position_map.keys())
for name, (_, pos) in forward_outputs_position_map.items():
output_autograd_meta_name = GetAutoGradMetaName(name)
set_out_rank = f" egr::EagerUtils::SetOutRankWithSlot({output_autograd_meta_name}, {pos});"
set_history = f" egr::EagerUtils::SetHistory({output_autograd_meta_name}, grad_node);"
if num_outputs == 1:
set_retain_grad = f" egr::EagerUtils::CheckAndRetainGrad(api_result);"
set_grad_in_meta = f" grad_node->SetGradInMeta(api_result, {pos});"
else:
set_retain_grad = f" egr::EagerUtils::CheckAndRetainGrad(std::get<{pos}>(api_result));"
set_grad_in_meta = f" grad_node->SetGradInMeta(std::get<{pos}>(api_result), {pos});"
set_out_rank_list.append(set_out_rank)
set_history_list.append(set_history)
set_grad_in_meta_list.append(set_grad_in_meta)
set_retain_grad_list.append(set_retain_grad)
set_out_rank_str = "\n".join(set_out_rank_list)
set_history_str = "\n".join(set_history_list)
set_grad_in_meta_str = "\n".join(set_grad_in_meta_list)
set_retain_grad_str = "\n".join(set_retain_grad_list)
node_event_name = fwd_api_name + " node_creation"
NODE_CREATION_TEMPLATE = """
paddle::platform::RecordEvent node_creation_record_event(\"{}\", paddle::platform::TracerEventType::Operator, 1);\n
"""
node_creation_event_str = NODE_CREATION_TEMPLATE.format(node_event_name)
NODE_CREATION_TEMPLATE = """ """
NODE_CREATION_TEMPLATE = \
"""
// Get AutoGradMeta // Get AutoGradMeta
{} {}
bool trace_backward = egr::Controller::Instance().HasGrad(); bool trace_backward = egr::Controller::Instance().HasGrad();
...@@ -924,185 +202,72 @@ def GenerateNodeCreationCodes( ...@@ -924,185 +202,72 @@ def GenerateNodeCreationCodes(
{} {}
}} }}
}} }}
"""
NAMESPACE_WRAPPER_TEMPLATE = \
"""
namespace {} {{
{}
}}
""" """
node_creation_str = NODE_CREATION_TEMPLATE.format(
inputs_autograd_meta_str, compute_require_grad_args_str,
check_inplace_str, forward_call_str, bump_inplace_version_str,
node_creation_event_str, outputs_autograd_meta_str,
pass_stop_gradient_args_str, node_construction_str, set_attributes_str,
set_tensor_wrappers_str, set_grad_out_meta_str, set_edges_str,
set_out_rank_str, set_history_str, set_grad_in_meta_str,
set_retain_grad_str)
return node_creation_str
def GenerateForwardDefinition(
fwd_api_name, bwd_api_name, forward_inputs_position_map,
forward_outputs_position_map, forward_attrs_list,
backward_fwd_input_map, backward_grad_input_map,
backward_grad_output_map, backward_attrs_list, optional_inputs,
intermediate_outputs, inplace_map):
# fwd_api_name = ""
# forward_inputs_position_map = { "name" : [type, fwd_position] }
# forward_outputs_position_map = { "name" : [type, fwd_position] }
# forward_attrs_list = [ [attr_name, attr_type, default_value, orig_position], ...]
# backward_fwd_input_map = { "name" : [type, is_fwd_input, orig_position] ...}
# backward_grad_input_map = { "name" : [type, fwd_position, orig_position] ...}
# backward_grad_output_map = { "name" : [type, fwd_position, orig_position] ...}
# backward_attrs_list = [ [attr_name, attr_type, default_value, orig_position], ...]
# optional_inputs = ["name0", ...]
# Get Function Args
num_inputs = len(forward_attrs_list) + len(forward_inputs_position_map.keys(
))
inputs_args_definition_list = ["" for i in range(num_inputs)]
inputs_args_declaration_list = ["" for i in range(num_inputs)]
inputs_call_list = ["" for i in range(num_inputs)]
for name, (ttype, pos) in forward_inputs_position_map.items():
inputs_call_list[pos] = f"{name}"
is_optional = (name in optional_inputs)
if IsPlainTensorType(ttype):
if is_optional:
arg_str = f"const paddle::optional<paddle::experimental::Tensor>& {name}"
else:
if inplace_map and name in inplace_map.keys():
arg_str = f"paddle::experimental::Tensor& {name}"
else:
arg_str = f"const paddle::experimental::Tensor& {name}"
else:
assert IsVectorTensorType(ttype)
arg_str = f"const std::vector<paddle::experimental::Tensor>& {name}"
NODE_CC_FILE_TEMPLATE = \
"""
#include "glog/logging.h"
#include "paddle/phi/api/all.h"
#include "paddle/phi/api/backward/backward_api.h"
#include "paddle/phi/api/backward/sparse_bw_api.h"
#include "paddle/fluid/imperative/tracer.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/eager/utils.h"
#include "paddle/fluid/eager/api/utils/global_utils.h"
#include "paddle/fluid/eager/api/generated/eager_generated/backwards/nodes.h"
#include "paddle/fluid/eager/to_static/run_program_op_node.h"
#include "paddle/phi/api/include/sparse_api.h"
{}
"""
inputs_args_definition_list[pos] = arg_str
inputs_args_declaration_list[pos] = arg_str
for name, atype, default_val, pos in forward_attrs_list:
inputs_call_list[pos] = name
if default_val is not None:
inputs_args_declaration_list[
pos] = f"{atype} {name} = {default_val}"
else:
inputs_args_declaration_list[pos] = f"{atype} {name}"
inputs_args_definition_list[pos] = f"{atype} {name}"
inputs_args_declaration_str = ", ".join(inputs_args_declaration_list)
inputs_args_definition_str = ", ".join(inputs_args_definition_list)
inputs_call_args_str = ", ".join(inputs_call_list)
# Forward Full Logic
if len(intermediate_outputs) == 0:
function_name = fwd_api_name
else:
function_name = fwd_api_name + "_intermediate"
if len(namespace) > 0:
forward_call_str = f"auto api_result = paddle::experimental::{namespace}::{function_name}({inputs_call_args_str});"
else:
forward_call_str = f"auto api_result = paddle::experimental::{function_name}({inputs_call_args_str});"
# Get return type list & outputs
num_outputs = len(forward_outputs_position_map.keys()) - len(
intermediate_outputs)
returns_type_list = ["" for i in range(num_outputs)]
returns_list = ["" for i in range(num_outputs)]
for name, (rtype, pos) in forward_outputs_position_map.items():
if name in intermediate_outputs:
continue
if num_outputs == 1:
returns_list[0] = f"api_result"
else:
# Tuple api_result
returns_list[pos] = f"std::get<{pos}>(api_result)"
if IsPlainTensorType(rtype):
returns_type_list[pos] = "paddle::experimental::Tensor"
else:
assert IsVectorTensorType(rtype)
returns_type_list[pos] = "std::vector<paddle::experimental::Tensor>"
if num_outputs == 1:
returns_str = returns_list[0]
returns_type_str = returns_type_list[0]
else:
returns_type_str = ", ".join(returns_type_list)
returns_type_str = f"std::tuple<{returns_type_str}>"
returns_str = ", ".join(returns_list)
returns_str = f"std::make_tuple({returns_str})"
node_creation_str = GenerateNodeCreationCodes(
fwd_api_name, bwd_api_name, forward_inputs_position_map,
forward_outputs_position_map, forward_attrs_list, forward_call_str,
backward_fwd_input_map, backward_grad_input_map,
backward_grad_output_map, backward_attrs_list, optional_inputs,
inplace_map)
dygraph_event_str = f"paddle::platform::RecordEvent dygraph_entrance_record_event(\"{fwd_api_name} dygraph\", paddle::platform::TracerEventType::Operator, 1);"
FORWARD_FUNCTION_TEMPLATE = """
{} {}({}) {{
{}
{} {}
// Returns
return {};
}}
""" """
NODE_H_FILE_TEMPLATE = \
"""
#pragma once
#include "paddle/fluid/eager/tensor_wrapper.h"
#include "paddle/fluid/eager/grad_node_info.h"
{}
"""
forward_function_name = GetForwardFunctionName(fwd_api_name)
forward_function_str = FORWARD_FUNCTION_TEMPLATE.format(
returns_type_str, forward_function_name, inputs_args_definition_str,
dygraph_event_str, node_creation_str, returns_str)
forward_function_declaration_str = f"{returns_type_str} {forward_function_name}({inputs_args_declaration_str});"
return forward_function_str, forward_function_declaration_str
def CollectCoreOpsInformation(fwd_api_name, forward_inputs_position_map,
forward_outputs_position_map, forward_attrs_list):
# fwd_api_name : ""
# forward_inputs_position_map = { "name" : [type, fwd_position] }
# forward_outputs_position_map = { "name" : [type, fwd_position] }
# forward_attrs_list = [ [attr_name, attr_type, default_value, orig_position], ...]
num_args = len(forward_inputs_position_map.keys()) + len(forward_attrs_list)
num_returns = len(forward_outputs_position_map.keys())
final_state_fwd_api_name = "final_state_" + fwd_api_name
core_ops_returns_info[
final_state_fwd_api_name] = ["" for i in range(num_returns)]
core_ops_args_info[final_state_fwd_api_name] = ["" for i in range(num_args)]
core_ops_args_type_info[
final_state_fwd_api_name] = ["" for i in range(num_args)]
for name, (ttype, pos) in forward_inputs_position_map.items():
core_ops_args_info[final_state_fwd_api_name][pos] = name
if IsPlainTensorType(ttype):
core_ops_args_type_info[final_state_fwd_api_name][pos] = "tensor"
else:
assert IsVectorTensorType(ttype)
core_ops_args_type_info[final_state_fwd_api_name][pos] = "list"
for name, _, _, pos in forward_attrs_list:
core_ops_args_info[final_state_fwd_api_name][pos] = name
for name, (ttype, pos) in forward_outputs_position_map.items():
core_ops_returns_info[final_state_fwd_api_name][pos] = name
FORWARD_CC_FILE_TEMPLATE = \
"""
#include "paddle/phi/api/lib/dygraph_api.h"
#include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h"
#include "paddle/fluid/eager/api/generated/eager_generated/backwards/nodes.h"
#include "paddle/phi/api/include/sparse_api.h"
#include "paddle/fluid/eager/api/utils/global_utils.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
{}
{}
"""
def GenerateCoreOpInfoDeclaration():
core_ops_declaration_str = """
extern std::unordered_map<std::string, std::vector<std::string>> core_ops_final_state_args_info;
extern std::unordered_map<std::string, std::vector<std::string>> core_ops_final_state_args_type_info;
extern std::unordered_map<std::string, std::vector<std::string>> core_ops_final_state_returns_info;
"""
return core_ops_declaration_str
FORWARD_H_FILE_TEMPLATE = \
"""
#pragma once
#include "glog/logging.h"
#include "paddle/fluid/eager/autograd_meta.h"
#include "paddle/phi/api/all.h"
#include "paddle/fluid/eager/utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/eager/to_static/run_program_op_func.h"
{}
{}
"""
def GenerateCoreOpInfoDefinition():
CORE_OPS_INFO_TEMPLATE = """
CORE_OPS_INFO_TEMPLATE = \
"""
std::unordered_map<std::string, std::vector<std::string>> core_ops_final_state_args_info = {{
{}
}};
...@@ -1114,6 +279,38 @@ std::unordered_map<std::string, std::vector<std::string>> core_ops_final_state_r
}};
"""
CORE_OPS_DECLARATION_TEMPLATE = \
"""
extern std::unordered_map<std::string, std::vector<std::string>> core_ops_final_state_args_info;
extern std::unordered_map<std::string, std::vector<std::string>> core_ops_final_state_args_type_info;
extern std::unordered_map<std::string, std::vector<std::string>> core_ops_final_state_returns_info;
"""
CHECK_INPLACE_TEMPLATE = \
"""
// Check Inplace
egr::EagerUtils::CheckInplace({}, {}, require_any_grad);\n
"""
BUMP_INPLACE_VERSION_TEMPLATE = \
"""
// Bump Inplace Version
{}.bump_inplace_version();
VLOG(3) << \"Tensor(\" << {}.name() << \") uses Inplace Strategy.\";\n
"""
#######################
## Generator Helpers ##
#######################
def GenerateCoreOpInfoDeclaration():
return CORE_OPS_DECLARATION_TEMPLATE
def GenerateCoreOpInfoDefinition():
op_args_info_list = []
for op_name, arg_list in core_ops_args_info.items():
arg_str = ",".join(["\"" + v + "\"" for v in arg_list])
...@@ -1142,68 +339,864 @@ std::unordered_map<std::string, std::vector<std::string>> core_ops_final_state_r
return core_ops_info_definition_str
#####################
## Generator Class ##
#####################
class DygraphSingleFunctionGenerator(FunctionGeneratorBase):
def __init__(self, forward_api_contents, grad_api_contents, namespace):
self.forward_api_contents = forward_api_contents
# Members from Parent:
#self.namespace
#self.forward_api_contents
#self.forward_api_name
#self.orig_forward_inputs_list
#self.orig_forward_attrs_list
#self.orig_forward_returns_list
#self.forward_inputs_position_map
#self.forward_outputs_position_map
#self.optional_inputs
#self.no_need_buffers
#self.intermediate_outputs
#self.inplace_map
FunctionGeneratorBase.__init__(self, forward_api_contents, namespace)
self.grad_api_contents = grad_api_contents
# Raw Contents
self.backward_forward_str = ""
self.backward_api_name = ""
self.forward_attrs_list = [
] #[ [attr_name, attr_type, default_value, orig_position], ...]
self.forward_inputs_list = [
] #[ [arg_name, arg_type, orig_position], ...]
self.forward_returns_list = [
] #[ [ret_name, ret_type, orig_position], ...]
self.backward_inputs_list = [
] #[ [attr_name, attr_type, default_value, orig_position], ...]
self.backward_attrs_list = [
] #[ [arg_name, arg_type, orig_position], ...]
self.backward_returns_list = [
] #[ [ret_name, ret_type, orig_position], ...]
# SlotNameMatched Backward Data
self.backward_forward_inputs_map = {
} #{ "name" : [type, is_fwd_input, orig_position] ...}
self.backward_grad_inputs_map = {
} #{ "name" : [type, fwd_position, orig_position] ...}
self.backward_grad_outputs_map = {
} #{ "name" : [type, fwd_position, orig_position] ...}
# Generated Results
self.forward_definition_str = ""
self.forward_declaration_str = ""
self.node_declaration_str = ""
self.node_definition_str = ""
def DygraphYamlValidationCheck(self):
forward_api_contents = self.forward_api_contents
grad_api_contents = self.grad_api_contents
assert 'api' in forward_api_contents.keys()
assert 'args' in forward_api_contents.keys()
assert 'output' in forward_api_contents.keys()
assert 'backward' in forward_api_contents.keys()
assert 'args' in grad_api_contents.keys()
assert 'output' in grad_api_contents.keys()
assert 'forward' in grad_api_contents.keys()
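# Editor's illustrative note (hypothetical values; only the key set is checked above):
#   forward_api_contents = {'api': 'matmul', 'args': '(Tensor x, Tensor y)',
#                           'output': 'Tensor(out)', 'backward': 'matmul_grad'}
#   grad_api_contents    = {'forward': 'matmul (Tensor x, Tensor y) -> Tensor(out)',
#                           'args': '(Tensor x, Tensor y, Tensor out_grad)',
#                           'output': 'Tensor(x_grad), Tensor(y_grad)'}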
def ForwardsValidationCheck(self):
forward_inputs_list = self.forward_inputs_list
forward_attrs_list = self.forward_attrs_list
forward_returns_list = self.forward_returns_list
orig_forward_inputs_list = self.orig_forward_inputs_list
orig_forward_attrs_list = self.orig_forward_attrs_list
orig_forward_returns_list = self.orig_forward_returns_list
for i in range(len(forward_inputs_list)):
forward_input_name = forward_inputs_list[i][0]
forward_input_type = forward_inputs_list[i][1]
forward_input_pos = forward_inputs_list[i][2]
orig_input_name = orig_forward_inputs_list[i][0]
orig_input_type = orig_forward_inputs_list[i][1]
orig_input_pos = orig_forward_inputs_list[i][2]
assert forward_input_type == orig_input_type
assert forward_input_pos == orig_input_pos
for i in range(len(forward_attrs_list)):
orig_attr_name = orig_forward_attrs_list[i][0]
orig_attr_type = orig_forward_attrs_list[i][1]
orig_attr_default = orig_forward_attrs_list[i][2]
orig_attr_pos = orig_forward_attrs_list[i][3]
forward_attr_name = forward_attrs_list[i][0]
forward_attr_type = forward_attrs_list[i][1]
forward_attr_default = forward_attrs_list[i][2]
forward_attr_pos = forward_attrs_list[i][3]
assert orig_attr_type == forward_attr_type
assert orig_attr_default == forward_attr_default
assert orig_attr_pos == forward_attr_pos
for i in range(len(forward_returns_list)):
orig_return_type = orig_forward_returns_list[i][1]
orig_return_pos = orig_forward_returns_list[i][2]
forward_return_type = forward_returns_list[i][1]
forward_return_pos = forward_returns_list[i][2]
assert orig_return_type == forward_return_type
assert orig_return_pos == forward_return_pos
# Check Order: Inputs, Attributes
max_input_position = -1
for _, _, pos in forward_inputs_list:
max_input_position = max(max_input_position, pos)
max_attr_position = -1
for _, _, _, pos in forward_attrs_list:
assert pos > max_input_position
max_attr_position = max(max_attr_position, pos)
def BackwardValidationCheck(self):
backward_forward_inputs_map = self.backward_forward_inputs_map
backward_grad_inputs_map = self.backward_grad_inputs_map
backward_attrs_list = self.backward_attrs_list
# Check Order: TensorWrappers, GradTensors, Attributes
max_fwd_input_position = -1
for _, (_, _, pos) in backward_forward_inputs_map.items():
max_fwd_input_position = max(max_fwd_input_position, pos)
max_grad_tensor_position = -1
for _, (_, _, pos) in backward_grad_inputs_map.items():
assert pos > max_fwd_input_position
max_grad_tensor_position = max(max_grad_tensor_position, pos)
max_attr_position = -1
for _, _, _, pos in backward_attrs_list:
assert pos > max_grad_tensor_position
max_attr_position = max(max_attr_position, pos)
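# Editor's note: the ordering enforced above means that for a hypothetical backward
# signature "(Tensor x, Tensor out_grad, float scale)" the positions must satisfy
# tensor wrappers (x: 0) < grad tensors (out_grad: 1) < attributes (scale: 2).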
def IntermediateValidationCheck(self):
intermediate_outputs = self.intermediate_outputs
forward_returns_list = self.forward_returns_list
"""
Check whether intermediate_outputs are positioned
at the very end of forward_returns_list
"""
intermediate_positions = range(
len(forward_returns_list) - len(intermediate_outputs),
len(forward_returns_list))
for ret_name, _, pos in forward_returns_list:
if ret_name in intermediate_outputs:
assert pos in intermediate_positions
def CollectBackwardInfo(self):
forward_api_contents = self.forward_api_contents
grad_api_contents = self.grad_api_contents
self.backward_api_name = forward_api_contents['backward']
self.backward_forward_str = grad_api_contents['forward']
backward_args_str = grad_api_contents['args']
backward_returns_str = grad_api_contents['output']
self.backward_inputs_list, self.backward_attrs_list, self.backward_returns_list = ParseYamlBackward(
backward_args_str, backward_returns_str)
print("Parsed Backward Inputs List: ", self.backward_inputs_list)
print("Prased Backward Attrs List: ", self.backward_attrs_list)
print("Parsed Backward Returns List: ", self.backward_returns_list)
def CollectForwardInfoFromBackwardContents(self):
backward_forward_str = self.backward_forward_str
self.forward_inputs_list, self.forward_attrs_list, self.forward_returns_list = ParseYamlForwardFromBackward(
backward_forward_str)
def SlotNameMatching(self):
backward_inputs_list = self.backward_inputs_list
backward_returns_list = self.backward_returns_list
forward_inputs_position_map = self.forward_inputs_position_map
forward_outputs_position_map = self.forward_outputs_position_map
for backward_input in backward_inputs_list:
backward_input_name = backward_input[0]
backward_input_type = backward_input[1]
backward_input_pos = backward_input[2]
backward_fwd_name = FindForwardName(backward_input_name)
if backward_fwd_name:
# Grad Input
assert backward_fwd_name in forward_outputs_position_map.keys()
matched_forward_output_type = forward_outputs_position_map[
backward_fwd_name][0]
matched_forward_output_pos = forward_outputs_position_map[
backward_fwd_name][1]
self.backward_grad_inputs_map[backward_input_name] = [
backward_input_type, matched_forward_output_pos,
backward_input_pos
]
else:
# TensorWrapper Input
if backward_input_name in forward_inputs_position_map.keys():
tensor_wrapper_type = forward_inputs_position_map[
backward_input_name][0]
self.backward_forward_inputs_map[backward_input_name] = [
backward_input_type, True, backward_input_pos
]
elif backward_input_name in forward_outputs_position_map.keys():
tensor_wrapper_type = forward_outputs_position_map[
backward_input_name][0]
self.backward_forward_inputs_map[backward_input_name] = [
backward_input_type, False, backward_input_pos
]
else:
assert False, backward_input_name
for backward_output in backward_returns_list:
backward_output_name = backward_output[0]
backward_output_type = backward_output[1]
backward_output_pos = backward_output[2]
backward_fwd_name = FindForwardName(backward_output_name)
assert backward_fwd_name is not None
assert backward_fwd_name in forward_inputs_position_map.keys(
), f"Unable to find {backward_fwd_name} in forward inputs"
matched_forward_input_type = forward_inputs_position_map[
backward_fwd_name][0]
matched_forward_input_pos = forward_inputs_position_map[
backward_fwd_name][1]
self.backward_grad_outputs_map[backward_output_name] = [
backward_output_type, matched_forward_input_pos,
backward_output_pos
]
print("Generated Backward Fwd Input Map: ",
self.backward_forward_inputs_map)
print("Generated Backward Grad Input Map: ",
self.backward_grad_inputs_map)
print("Generated Backward Grad Output Map: ",
self.backward_grad_outputs_map)
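# Editor's note: for a hypothetical "matmul"-style entry the three maps printed above
# could look like this (illustration only, following the documented layouts
# { "name" : [type, is_fwd_input / fwd_position, orig_position] }):
#   backward_forward_inputs_map = {'x': ['Tensor', True, 0], 'y': ['Tensor', True, 1]}
#   backward_grad_inputs_map    = {'out_grad': ['Tensor', 0, 2]}
#   backward_grad_outputs_map   = {'x_grad': ['Tensor', 0, 0], 'y_grad': ['Tensor', 1, 1]}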
def GenerateNodeDeclaration(self):
forward_op_name = self.forward_api_name
backward_forward_inputs_map = self.backward_forward_inputs_map
backward_attrs_list = self.backward_attrs_list
no_need_buffers = self.no_need_buffers
# SetTensorWrapper Methods & TensorWrapper Members
set_tensor_wrapper_methods_str = ""
tensor_wrapper_members_str = ""
clear_tensor_wrapper_str = ""
for tname, (ttype, is_fwd_input,
_) in backward_forward_inputs_map.items():
no_need_buffer = "true" if tname in no_need_buffers else "false"
tensor_wrapper_name = GetSavedName(tname)
if IsPlainTensorType(ttype):
set_tensor_wrapper_methods_str += SET_PLAIN_TENSOR_WRAPPER_TEMPLATE.format(
tname, tname, tensor_wrapper_name, tname, no_need_buffer)
tensor_wrapper_members_str += PLAIN_TENSOR_MEMBER_TEMPLATE.format(
tensor_wrapper_name)
clear_tensor_wrapper_str += CLEAR_TENSOR_WRAPPER_TEMPLATE.format(
tensor_wrapper_name)
else:
assert IsVectorTensorType(ttype)
set_tensor_wrapper_methods_str += SET_VECTOR_TENSOR_WRAPPER_TEMPLATE.format(
tname, tname, tname, tensor_wrapper_name, no_need_buffer)
tensor_wrapper_members_str += VECTOR_TENSOR_MEMBER_TEMPLATE.format(
tensor_wrapper_name)
clear_tensor_wrapper_str += CLEAR_VECTOR_TENSOR_WRAPPERS_TEMPLATE.format(
tensor_wrapper_name)
# SetAttributes & Attribute Members
set_attribute_methods_str = ""
attribute_members_str = ""
for aname, atype, default_val, _ in backward_attrs_list:
saved_attr_name = GetSavedName(aname)
set_attribute_methods_str += SET_ATTR_METHOD_TEMPLATE.format(
aname, GetConstReference(atype), aname, saved_attr_name, aname)
if default_val:
attribute_members_str += ATTRIBUTE_MEMBER_WITH_DEFAULT_TEMPLATE.format(
RemoveConstAndReference(atype), saved_attr_name,
default_val)
else:
attribute_members_str += ATTRIBUTE_MEMBER_TEMPLATE.format(
RemoveConstAndReference(atype), saved_attr_name)
grad_node_name = GetGradNodeName(forward_op_name)
self.node_declaration_str = NODE_DECLARATION_TEMPLATE.format(
grad_node_name, grad_node_name, grad_node_name, grad_node_name,
grad_node_name, clear_tensor_wrapper_str,
set_tensor_wrapper_methods_str, set_attribute_methods_str,
tensor_wrapper_members_str, attribute_members_str)
print("Generated Node Declaration: ", self.node_declaration_str)
def GenerateNodeDefinition(self):
namespace = self.namespace
forward_api_name = self.forward_api_name
backward_api_name = self.backward_api_name
backward_forward_inputs_map = self.backward_forward_inputs_map
backward_grad_inputs_map = self.backward_grad_inputs_map
backward_grad_outputs_map = self.backward_grad_outputs_map
backward_attrs_list = self.backward_attrs_list
# Construct grad_api function args
# Order: TensorWrappers, GradTensors, Attributes
grad_api_args_len = len(backward_forward_inputs_map.keys()) + len(
backward_grad_inputs_map.keys()) + len(backward_attrs_list)
grad_api_args = ["" for i in range(grad_api_args_len)]
for name, (_, is_fwd_input,
grad_api_position), in backward_forward_inputs_map.items():
tensor_wrapper_name = GetSavedName(name)
grad_api_args[
grad_api_position] = f"egr::EagerUtils::RecoverTensorWrapper(&this->{tensor_wrapper_name}, nullptr)"
for _, (ttype, fwd_position,
grad_api_position) in backward_grad_inputs_map.items():
if IsPlainTensorType(ttype):
grad_api_args[
grad_api_position] = f"hooked_grads[{fwd_position}][0]"
else:
assert IsVectorTensorType(ttype)
grad_api_args[
grad_api_position] = f"hooked_grads[{fwd_position}]"
for name, _, _, grad_api_position in backward_attrs_list:
saved_attribute_name = GetSavedName(name)
grad_api_args[grad_api_position] = f"this->{saved_attribute_name}"
grad_api_args_str = ", ".join(grad_api_args)
# Construct grad_api returns
num_bwd_outputs = len(backward_grad_outputs_map.keys())
returns_str = f"std::vector<std::vector<paddle::experimental::Tensor>> returns({num_bwd_outputs});\n"
for _, (ttype, fwd_position,
grad_api_position) in backward_grad_outputs_map.items():
# Infer Grad API Return Type
if num_bwd_outputs == 1:
# Single tensor output, return as is
if IsPlainTensorType(ttype):
returns_str += "returns[0] = { grad_api_returns };\n"
else:
assert IsVectorTensorType(ttype)
returns_str += "returns[0] = grad_api_returns;\n"
else:
# Rearrange output order accordingly
returns_str += f"returns[{fwd_position}] = grad_api_returns[{grad_api_position}];\n"
returns_str += f"if(NeedComplexToRealConversion()) HandleComplexGradToRealGrad(&returns);\n"
returns_str += f"return returns;\n"
grad_node_name = GetGradNodeName(forward_api_name)
fill_zero_str = ""
if forward_api_name in ops_to_fill_zero_for_empty_grads:
fill_zero_str = "egr::EagerUtils::FillZeroForEmptyGradInputs(&grads, this->InputMeta());\n"
grad_api_namespace = f"paddle::experimental::{namespace}"
self.node_definition_str = FUNCTION_TEMPLATE.format(
grad_node_name, fill_zero_str, grad_node_name, grad_api_namespace,
backward_api_name, grad_api_args_str, returns_str)
print("Generated Node Definition: ", self.node_definition_str)
def GenerateForwardDefinition(self, is_inplaced):
namespace = self.namespace
forward_api_name = GetInplacedFunctionName(
self.forward_api_name) if is_inplaced else self.forward_api_name
backward_api_name = self.backward_api_name
forward_inputs_position_map = self.forward_inputs_position_map
forward_outputs_position_map = self.forward_outputs_position_map
forward_attrs_list = self.forward_attrs_list
backward_forward_inputs_map = self.backward_forward_inputs_map
backward_grad_inputs_map = self.backward_grad_inputs_map
backward_grad_outputs_map = self.backward_grad_outputs_map
backward_attrs_list = self.backward_attrs_list
optional_inputs = self.optional_inputs
intermediate_outputs = self.intermediate_outputs
inplace_map = self.inplace_map
# Get Function Args
num_inputs = len(forward_attrs_list) + len(
forward_inputs_position_map.keys())
inputs_args_definition_list = ["" for i in range(num_inputs)]
inputs_args_declaration_list = ["" for i in range(num_inputs)]
inputs_call_list = ["" for i in range(num_inputs)]
for name, (ttype, pos) in forward_inputs_position_map.items():
inputs_call_list[pos] = f"{name}"
is_optional = (name in optional_inputs)
if IsPlainTensorType(ttype):
if is_optional:
arg_str = f"const paddle::optional<paddle::experimental::Tensor>& {name}"
else:
if inplace_map and name in inplace_map.keys():
arg_str = f"paddle::experimental::Tensor& {name}"
else:
arg_str = f"const paddle::experimental::Tensor& {name}"
else:
assert IsVectorTensorType(ttype)
arg_str = f"const std::vector<paddle::experimental::Tensor>& {name}"
inputs_args_definition_list[pos] = arg_str
inputs_args_declaration_list[pos] = arg_str
for name, atype, default_val, pos in forward_attrs_list:
inputs_call_list[pos] = name
if default_val is not None:
inputs_args_declaration_list[
pos] = f"{atype} {name} = {default_val}"
else:
inputs_args_declaration_list[pos] = f"{atype} {name}"
inputs_args_definition_list[pos] = f"{atype} {name}"
inputs_args_declaration_str = ", ".join(inputs_args_declaration_list)
inputs_args_definition_str = ", ".join(inputs_args_definition_list)
inputs_call_args_str = ", ".join(inputs_call_list)
# Forward Full Logic
function_name = forward_api_name
if len(intermediate_outputs) > 0:
function_name = GetIntermediateAPIFunctionName(function_name)
forward_call_str = f"auto api_result = paddle::experimental::{namespace}{function_name}({inputs_call_args_str});"
# Get return type list & outputs
num_outputs = len(forward_outputs_position_map.keys()) - len(
intermediate_outputs)
returns_type_list = ["" for i in range(num_outputs)]
returns_list = ["" for i in range(num_outputs)]
for name, (rtype, pos) in forward_outputs_position_map.items():
if name in intermediate_outputs:
continue
if num_outputs == 1:
returns_list[0] = f"api_result"
else:
# Tuple api_result
returns_list[pos] = f"std::get<{pos}>(api_result)"
if IsPlainTensorType(rtype):
returns_type_list[pos] = "paddle::experimental::Tensor"
else:
assert IsVectorTensorType(rtype)
returns_type_list[
pos] = "std::vector<paddle::experimental::Tensor>"
if num_outputs == 1:
returns_str = returns_list[0]
returns_type_str = returns_type_list[0]
else:
returns_type_str = ", ".join(returns_type_list)
returns_type_str = f"std::tuple<{returns_type_str}>"
returns_str = ", ".join(returns_list)
returns_str = f"std::make_tuple({returns_str})"
self.GenerateNodeCreationCodes(forward_call_str)
node_creation_str = self.node_creation_str
dygraph_event_str = f"paddle::platform::RecordEvent dygraph_entrance_record_event(\"{forward_api_name} dygraph\", paddle::platform::TracerEventType::Operator, 1);"
forward_function_name = GetDygraphForwardFunctionName(forward_api_name)
self.forward_definition_str += FORWARD_FUNCTION_TEMPLATE.format(
returns_type_str, forward_function_name, inputs_args_definition_str,
dygraph_event_str, node_creation_str, returns_str)
self.forward_declaration_str += f"{returns_type_str} {forward_function_name}({inputs_args_declaration_str});\n"
print("Generated Forward Definition: ", self.forward_definition_str)
print("Generated Forward Declaration: ", self.forward_declaration_str)
def GenerateNodeCreationCodes(self, forward_call_str):
forward_api_name = self.forward_api_name
forward_inputs_position_map = self.forward_inputs_position_map
forward_outputs_position_map = self.forward_outputs_position_map
forward_attrs_list = self.forward_attrs_list
backward_forward_inputs_map = self.backward_forward_inputs_map
backward_grad_inputs_map = self.backward_grad_inputs_map
backward_grad_outputs_map = self.backward_grad_outputs_map
backward_attrs_list = self.backward_attrs_list
optional_inputs = self.optional_inputs
inplace_map = self.inplace_map
# Get Input AutoGradMeta
inputs_autograd_meta_list = []
compute_require_grad_args_list = ["trace_backward"]
for name, (ttype, pos) in forward_inputs_position_map.items():
input_autograd_meta_name = GetAutoGradMetaName(name)
if IsPlainTensorType(ttype):
input_autograd_meta = f" egr::AutogradMeta* {input_autograd_meta_name} = egr::EagerUtils::nullable_autograd_meta({name});"
else:
assert IsVectorTensorType(ttype)
input_autograd_meta_vec_name = GetAutoGradMetaVectorName(name)
input_autograd_meta = f" std::vector<egr::AutogradMeta*> {input_autograd_meta_vec_name} = egr::EagerUtils::nullable_autograd_meta({name});\n"
input_autograd_meta += f" std::vector<egr::AutogradMeta*>* {input_autograd_meta_name} = &{input_autograd_meta_vec_name};"
inputs_autograd_meta_list.append(input_autograd_meta)
compute_require_grad_args_list.append(input_autograd_meta_name)
inputs_autograd_meta_str = "\n".join(inputs_autograd_meta_list)
compute_require_grad_args_str = ",".join(compute_require_grad_args_list)
# Get Output AutoGradMeta
outputs_autograd_meta_list = []
pass_stop_gradient_args_list = ["false"]
num_fwd_outputs = len(forward_outputs_position_map.keys())
for name, (rtype, pos) in forward_outputs_position_map.items():
output_autograd_meta_name = GetAutoGradMetaName(name)
output_autograd_meta_vec_name = GetAutoGradMetaVectorName(name)
if num_fwd_outputs == 1:
if IsPlainTensorType(rtype):
output_autograd_meta = f" egr::AutogradMeta* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(&api_result);"
else:
assert IsVectorTensorType(rtype)
output_autograd_meta = f" std::vector<egr::AutogradMeta*> {output_autograd_meta_vec_name} = egr::EagerUtils::autograd_meta(&api_result);\n"
output_autograd_meta += f" std::vector<egr::AutogradMeta*>* {output_autograd_meta_name} = &{output_autograd_meta_vec_name};"
else:
# Tuple api_result
if IsPlainTensorType(rtype):
output_autograd_meta = f" egr::AutogradMeta* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(&std::get<{pos}>(api_result));"
else:
assert IsVectorTensorType(rtype)
output_autograd_meta = f" std::vector<egr::AutogradMeta*> {output_autograd_meta_vec_name} = egr::EagerUtils::autograd_meta(&std::get<{pos}>(api_result));\n"
output_autograd_meta += f" std::vector<egr::AutogradMeta*>* {output_autograd_meta_name} = &{output_autograd_meta_vec_name};"
outputs_autograd_meta_list.append(output_autograd_meta)
pass_stop_gradient_args_list.append(output_autograd_meta_name)
# ComputeRequireGrad & PassStopGradient
outputs_autograd_meta_str = "\n".join(outputs_autograd_meta_list)
pass_stop_gradient_args_str = ",".join(pass_stop_gradient_args_list)
# Check Inplace
check_inplace_str = ""
bump_inplace_version_str = ""
for inplace_name in inplace_map.keys():
inplace_autograd_meta_name = GetAutoGradMetaName(inplace_name)
check_inplace_str += CHECK_INPLACE_TEMPLATE.format(
inplace_name, inplace_autograd_meta_name)
bump_inplace_version_str += BUMP_INPLACE_VERSION_TEMPLATE.format(
inplace_name, inplace_name)
# Node Construction
num_backward_inputs = len(backward_grad_inputs_map.keys())
num_backward_outputs = len(backward_grad_outputs_map.keys())
grad_node_name = GetGradNodeName(forward_api_name)
node_construction_str = f" auto grad_node = std::make_shared<{grad_node_name}>({num_backward_inputs}, {num_backward_outputs});"
# SetAttributes
set_attributes_list = []
forward_attrs_name_set = set()
for name, _, _, _ in forward_attrs_list:
forward_attrs_name_set.add(name)
for name, _, default_val_attr, _ in backward_attrs_list:
if name in forward_attrs_name_set:
set_attributes = f" grad_node->SetAttribute{name}({name});"
else:
set_attributes = f" grad_node->SetAttribute{name}({default_val_attr});"
set_attributes_list.append(set_attributes)
set_attributes_str = "\n".join(set_attributes_list)
# SetTensorWrappers
set_tensor_wrappers_list = []
for name, (atype, is_fwd_input,
pos) in backward_forward_inputs_map.items():
is_optional = (name in optional_inputs)
if is_fwd_input:
if is_optional:
set_tensor_wrappers = f" if({name}.is_initialized()) grad_node->SetTensorWrapper{name}({name}, true);"
else:
set_tensor_wrappers = f" grad_node->SetTensorWrapper{name}({name}, true);"
else:
if num_fwd_outputs > 1:
# Aligned with forward output position
assert name in forward_outputs_position_map.keys()
fwd_output_pos = forward_outputs_position_map[name][1]
tw_name = f"std::get<{fwd_output_pos}>(api_result)"
else:
tw_name = f"api_result"
if is_optional:
set_tensor_wrappers = f" if({tw_name}.is_initialized()) grad_node->SetTensorWrapper{name}({tw_name}, false);"
else:
set_tensor_wrappers = f" grad_node->SetTensorWrapper{name}({tw_name}, false);"
set_tensor_wrappers_list.append(set_tensor_wrappers)
set_tensor_wrappers_str = "\n".join(set_tensor_wrappers_list)
# SetGradOutMeta & SetEdges
set_grad_out_meta_list = []
set_edges_list = []
for name, (_, pos) in forward_inputs_position_map.items():
input_autograd_meta_name = GetAutoGradMetaName(name)
set_grad_out_meta = f" grad_node->SetGradOutMeta({name}, {pos});"
set_edges = f" grad_node->AddEdges({input_autograd_meta_name}, {pos});"
set_grad_out_meta_list.append(set_grad_out_meta)
set_edges_list.append(set_edges)
set_grad_out_meta_str = "\n".join(set_grad_out_meta_list)
set_edges_str = "\n".join(set_edges_list)
# SetOutRank & SetHistory & SetGradInMeta
set_out_rank_list = []
set_history_list = []
set_grad_in_meta_list = []
set_retain_grad_list = []
num_outputs = len(forward_outputs_position_map.keys())
for name, (_, pos) in forward_outputs_position_map.items():
output_autograd_meta_name = GetAutoGradMetaName(name)
set_out_rank = f" egr::EagerUtils::SetOutRankWithSlot({output_autograd_meta_name}, {pos});"
set_history = f" egr::EagerUtils::SetHistory({output_autograd_meta_name}, grad_node);"
if num_outputs == 1:
set_retain_grad = f" egr::EagerUtils::CheckAndRetainGrad(api_result);"
set_grad_in_meta = f" grad_node->SetGradInMeta(api_result, {pos});"
else:
set_retain_grad = f" egr::EagerUtils::CheckAndRetainGrad(std::get<{pos}>(api_result));"
set_grad_in_meta = f" grad_node->SetGradInMeta(std::get<{pos}>(api_result), {pos});"
set_out_rank_list.append(set_out_rank)
set_history_list.append(set_history)
set_grad_in_meta_list.append(set_grad_in_meta)
set_retain_grad_list.append(set_retain_grad)
set_out_rank_str = "\n".join(set_out_rank_list)
set_history_str = "\n".join(set_history_list)
set_grad_in_meta_str = "\n".join(set_grad_in_meta_list)
set_retain_grad_str = "\n".join(set_retain_grad_list)
node_event_name = forward_api_name + " node_creation"
node_creation_event_str = f"paddle::platform::RecordEvent node_creation_record_event(\"{node_event_name}\", paddle::platform::TracerEventType::Operator, 1);\n"
self.node_creation_str = NODE_CREATION_TEMPLATE.format(
inputs_autograd_meta_str, compute_require_grad_args_str,
check_inplace_str, forward_call_str, bump_inplace_version_str,
node_creation_event_str, outputs_autograd_meta_str,
pass_stop_gradient_args_str, node_construction_str,
set_attributes_str, set_tensor_wrappers_str, set_grad_out_meta_str,
set_edges_str, set_out_rank_str, set_history_str,
set_grad_in_meta_str, set_retain_grad_str)
def GenerateInplacedForwardDygraphFunctions(self):
# Inplaced Version Dygraph Function Generation
forward_api_name = self.forward_api_name
forward_api_contents = self.forward_api_contents
if forward_api_name != "sum" and "inplace" in forward_api_contents.keys(
):
# Node Definition Generation
self.GenerateForwardDefinition(is_inplaced=True)
self.UpdateCoreOpsInformation(is_inplaced=True)
def UpdateCoreOpsInformation(self, is_inplaced):
forward_api_name = GetInplacedFunctionName(
self.forward_api_name) if is_inplaced else self.forward_api_name
forward_inputs_position_map = self.forward_inputs_position_map
forward_outputs_position_map = self.forward_outputs_position_map
forward_attrs_list = self.forward_attrs_list
num_args = len(forward_inputs_position_map.keys()) + len(
forward_attrs_list)
num_returns = len(forward_outputs_position_map.keys())
final_state_fwd_api_name = "final_state_" + forward_api_name
core_ops_returns_info[
final_state_fwd_api_name] = ["" for i in range(num_returns)]
core_ops_args_info[
final_state_fwd_api_name] = ["" for i in range(num_args)]
core_ops_args_type_info[
final_state_fwd_api_name] = ["" for i in range(num_args)]
for name, (ttype, pos) in forward_inputs_position_map.items():
core_ops_args_info[final_state_fwd_api_name][pos] = name
if IsPlainTensorType(ttype):
core_ops_args_type_info[final_state_fwd_api_name][
pos] = "tensor"
else:
assert IsVectorTensorType(ttype)
core_ops_args_type_info[final_state_fwd_api_name][pos] = "list"
for name, _, _, pos in forward_attrs_list:
core_ops_args_info[final_state_fwd_api_name][pos] = name
for name, (ttype, pos) in forward_outputs_position_map.items():
core_ops_returns_info[final_state_fwd_api_name][pos] = name
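# Editor's note: after this update, a hypothetical forward api "matmul" with inputs
# (x, y) and output (out) would add entries of the form (illustration only):
#   core_ops_args_info["final_state_matmul"]      = ["x", "y"]
#   core_ops_args_type_info["final_state_matmul"] = ["tensor", "tensor"]
#   core_ops_returns_info["final_state_matmul"]   = ["out"]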
def run(self):
# Basic Validation Check
self.DygraphYamlValidationCheck()
##########################
## Parsing Raw Contents ##
##########################
# Parse inplace_map
self.ParseInplaceInfo()
# Parse no_need_buffer
self.ParseNoNeedBuffer()
# Parse optional_inputs
self.ParseDispensable()
# Parse intermediate_outputs
self.ParseIntermediate()
self.IntermediateValidationCheck()
# Initialize backward_forward_str, backward_inputs_list, backward_attrs_list, backward_returns_list
self.CollectBackwardInfo()
# Initialize forward_inputs_list, forward_attrs_list, forward_returns_list
self.CollectForwardInfoFromBackwardContents()
# Initialize orig_forward_inputs_list, orig_forward_attrs_list, orig_forward_returns_list
self.CollectOriginalForwardInfo()
# Forwards Validation Check
self.ForwardsValidationCheck()
#############################
## Process Parsed Contents ##
#############################
# Initialize forward_inputs_position_map, forward_outputs_position_map
self.DetermineForwardPositionMap(self.forward_inputs_list,
self.forward_returns_list)
# Initialize forward_inputs_position_map, forward_outputs_position_map
self.SlotNameMatching()
# Backward Validation Check
self.BackwardValidationCheck()
#####################
## Code Generation ##
#####################
self.GenerateNodeDeclaration()
self.GenerateNodeDefinition()
self.GenerateForwardDefinition(is_inplaced=False)
self.UpdateCoreOpsInformation(is_inplaced=False)
self.GenerateInplacedForwardDygraphFunctions()
class DygraphYamlGenerator(YamlGeneratorBase):
def __init__(self, api_yaml_path, backward_yaml_path):
# Parent members:
# self.namespace
# self.api_yaml_path
# self.forward_api_list
YamlGeneratorBase.__init__(self, api_yaml_path)
self.backward_yaml_path = backward_yaml_path
self.grad_api_dict = {}
self.forward_definition_str = ""
self.forward_declaration_str = ""
self.node_declaration_str = ""
self.node_definition_str = ""
def ParseYamlContents(self):
self.ParseForwardYamlContents()
backward_yaml_path = self.backward_yaml_path
self.grad_api_dict = ReadBwdFile(backward_yaml_path)
def GetBackwardAPIContents(self, forward_api_contents):
grad_api_dict = self.grad_api_dict
if 'backward' not in forward_api_contents.keys(): return None
backward_api_name = forward_api_contents['backward']
assert backward_api_name in grad_api_dict.keys()
backward_api_contents = grad_api_dict[backward_api_name]
return backward_api_contents
def GenerateCode(self):
forward_api_list = self.forward_api_list
grad_api_dict = self.grad_api_dict
namespace = self.namespace
for forward_api_contents in forward_api_list:
backward_api_contents = self.GetBackwardAPIContents(
forward_api_contents)
if backward_api_contents is None: continue
d_generator = DygraphSingleFunctionGenerator(
forward_api_contents, backward_api_contents, namespace)
d_generator.run()
self.forward_definition_str += d_generator.forward_definition_str + "\n"
self.forward_declaration_str += d_generator.forward_declaration_str + "\n"
self.node_declaration_str += d_generator.node_declaration_str + "\n"
self.node_definition_str += d_generator.node_definition_str + "\n"
if len(namespace) > 0:
if namespace.endswith("::"):
namespace = namespace[:-2]
self.forward_definition_str = NAMESPACE_WRAPPER_TEMPLATE.format(
namespace, self.forward_definition_str)
self.forward_declaration_str = NAMESPACE_WRAPPER_TEMPLATE.format(
namespace, self.forward_declaration_str)
self.node_declaration_str = NAMESPACE_WRAPPER_TEMPLATE.format(
namespace, self.node_declaration_str)
self.node_definition_str = NAMESPACE_WRAPPER_TEMPLATE.format(
namespace, self.node_definition_str)
def run(self):
self.ParseYamlContents()
self.InferNameSpace()
self.GenerateCode()
##################
## File Writers ##
##################
def GenerateNodeCCFile(filepath, node_definition_str):
file_contents = """
#include "glog/logging.h"
#include "paddle/phi/api/all.h"
#include "paddle/phi/api/backward/backward_api.h"
#include "paddle/fluid/imperative/tracer.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/eager/utils.h"
#include "paddle/fluid/eager/api/utils/global_utils.h"
#include "paddle/fluid/eager/api/generated/eager_generated/backwards/nodes.h"
#include "paddle/fluid/eager/to_static/run_program_op_node.h"
#include "paddle/phi/api/backward/sparse_bw_api.h"
"""
file_contents += node_definition_str
if os.path.exists(filepath):
os.remove(filepath)
file_contents = NODE_CC_FILE_TEMPLATE.format(node_definition_str)
with open(filepath, 'a') as f:
f.write(file_contents)
def GenerateNodeHFile(filepath, node_declaration_str):
file_contents = """
#pragma once
#include "paddle/fluid/eager/tensor_wrapper.h"
#include "paddle/fluid/eager/grad_node_info.h"
"""
file_contents += node_declaration_str
if os.path.exists(filepath):
os.remove(filepath)
file_contents = NODE_H_FILE_TEMPLATE.format(node_declaration_str)
with open(filepath, 'a') as f:
f.write(file_contents)
def GenerateForwardCCFile(filepath, forward_definition_str):
file_contents = """
#include "paddle/phi/api/lib/dygraph_api.h"
#include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h"
#include "paddle/fluid/eager/api/generated/eager_generated/backwards/nodes.h"
#include "paddle/phi/api/include/sparse_api.h"
#include "paddle/fluid/eager/api/utils/global_utils.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
"""
file_contents += GenerateCoreOpInfoDefinition()
file_contents += forward_definition_str
if os.path.exists(filepath):
os.remove(filepath)
core_ops_info_str = GenerateCoreOpInfoDefinition()
file_contents = FORWARD_CC_FILE_TEMPLATE.format(core_ops_info_str,
forward_definition_str)
with open(filepath, 'a') as f:
f.write(file_contents)
def GenerateForwardHFile(filepath, forward_function_declaration_str):
file_contents = """
#pragma once
#include "glog/logging.h"
#include "paddle/fluid/eager/autograd_meta.h"
#include "paddle/phi/api/all.h"
#include "paddle/fluid/eager/utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/eager/to_static/run_program_op_func.h"
"""
file_contents += GenerateCoreOpInfoDeclaration()
file_contents += forward_function_declaration_str
if os.path.exists(filepath):
os.remove(filepath)
core_ops_info_str = GenerateCoreOpInfoDeclaration()
file_contents = FORWARD_H_FILE_TEMPLATE.format(
core_ops_info_str, forward_function_declaration_str)
with open(filepath, 'a') as f:
f.write(file_contents)
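# ---------------------------------------------------------------------------
# Editor's illustrative sketch (hypothetical paths, not part of this file): once a
# DygraphYamlGenerator has run(), its accumulated strings are written out through the
# four helpers above; each helper removes any stale file before appending fresh contents.
def _example_write_generated_files(generator,
                                   nodes_h_path="nodes.h",
                                   nodes_cc_path="nodes.cc",
                                   forwards_h_path="dygraph_forward_api.h",
                                   forwards_cc_path="dygraph_forward_functions.cc"):
    GenerateNodeHFile(nodes_h_path, generator.node_declaration_str)
    GenerateNodeCCFile(nodes_cc_path, generator.node_definition_str)
    GenerateForwardHFile(forwards_h_path, generator.forward_declaration_str)
    GenerateForwardCCFile(forwards_cc_path, generator.forward_definition_str)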
...@@ -1224,199 +1217,13 @@ if __name__ == "__main__":
api_yaml_path = api_yaml_paths[i]
backward_yaml_path = backward_yaml_paths[i]
generator = DygraphYamlGenerator(api_yaml_path, backward_yaml_path)
generator.run()
node_declaration_str += generator.node_declaration_str + "\n"
node_definition_str += generator.node_definition_str + "\n"
forward_definition_str += generator.forward_definition_str + "\n"
forward_declaration_str += generator.forward_declaration_str + "\n"
if "sparse" in api_yaml_path:
assert "sparse" in backward_yaml_path
namespace = "sparse"
else:
namespace = ""
fwd_api_list = ReadFwdFile(api_yaml_path)
grad_api_dict = ReadBwdFile(backward_yaml_path)
yaml_forward_definition_str = ""
yaml_forward_declaration_str = ""
yaml_node_declaration_str = ""
yaml_node_definition_str = ""
for fwd_api in fwd_api_list:
# We only generate Ops with grad
if 'backward' not in fwd_api.keys():
continue
assert 'api' in fwd_api.keys()
assert 'args' in fwd_api.keys()
assert 'output' in fwd_api.keys()
assert 'backward' in fwd_api.keys()
no_need_buffer_set = set()
if 'no_need_buffer' in fwd_api.keys():
no_need_buffer_set = ParseNoNeedBuffer(fwd_api[
'no_need_buffer'])
fwd_api_name = fwd_api['api']
fwd_args_str = fwd_api['args']
fwd_returns_str = fwd_api['output']
inplace_map = {}
if 'inplace' in fwd_api.keys():
inplace_map = ParseInplaceInfo(fwd_api['inplace'])
bwd_api_name = fwd_api['backward']
assert bwd_api_name in grad_api_dict.keys(), bwd_api_name
bwd_api = grad_api_dict[bwd_api_name]
assert 'args' in bwd_api.keys()
assert 'output' in bwd_api.keys()
assert 'forward' in bwd_api.keys()
# Parse Dispensable Inputs
optional_inputs = []
if 'optional' in fwd_api.keys():
optional_inputs = ParseDispensable(fwd_api['optional'])
bwd_forward_str = bwd_api['forward']
bwd_args_str = bwd_api['args']
bwd_returns_str = bwd_api['output']
# Collect Forward Inputs/Outputs
forward_inputs_list, forward_attrs_list, forward_returns_list = ParseYamlForwardFromBackward(
bwd_forward_str)
print("Parsed Forward Inputs List: ", forward_inputs_list)
print("Prased Forward Attrs List: ", forward_attrs_list)
print("Parsed Forward Returns List: ", forward_returns_list)
intermediate_outputs = []
if 'intermediate' in fwd_api.keys():
intermediate_outputs = ParseIntermediate(fwd_api[
'intermediate'])
IntermediateValidationCheck(intermediate_outputs,
forward_returns_list)
# Collect Original Forward Inputs/Outputs and then perform validation checks
orig_forward_inputs_list, orig_forward_attrs_list, orig_forward_returns_list = ParseYamlForward(
fwd_args_str, fwd_returns_str)
print("Parsed Original Forward Inputs List: ",
orig_forward_inputs_list)
print("Prased Original Forward Attrs List: ",
orig_forward_attrs_list)
print("Parsed Original Forward Returns List: ",
orig_forward_returns_list)
# Forward Validation Checks
ForwardsValidationCheck(
forward_inputs_list, forward_attrs_list, forward_returns_list,
orig_forward_inputs_list, orig_forward_attrs_list,
orig_forward_returns_list)
# Parse Backward Inputs/Outputs
backward_inputs_list, backward_attrs_list, backward_returns_list = ParseYamlBackward(
bwd_args_str, bwd_returns_str)
print("Parsed Backward Inputs List: ", backward_inputs_list)
print("Prased Backward Attrs List: ", backward_attrs_list)
print("Parsed Backward Returns List: ", backward_returns_list)
# Determine Forward Inputs/Outputs Position
forward_inputs_position_map, forward_outputs_position_map = DetermineForwardPositionMap(
forward_inputs_list, forward_returns_list)
print("Generated Forward Input Position Map: ",
forward_inputs_position_map)
print("Generated Forward Output Position Map: ",
forward_outputs_position_map)
# SlotName Matching
backward_fwd_input_map, backward_grad_input_map, backward_grad_output_map = SlotNameMatching(
backward_inputs_list, backward_returns_list,
forward_inputs_position_map, forward_outputs_position_map)
print("Generated Backward Fwd Input Map: ", backward_fwd_input_map)
print("Generated Backward Grad Input Map: ",
backward_grad_input_map)
print("Generated Backward Grad Output Map: ",
backward_grad_output_map)
# Backward Validation Check
BackwardValidationCheck(backward_fwd_input_map,
backward_grad_input_map,
backward_attrs_list)
# Node Declaration Generation
yaml_node_declaration_str += GenerateNodeDeclaration(
fwd_api_name, backward_fwd_input_map, backward_attrs_list,
no_need_buffer_set)
print("Generated Node Declaration: ", node_declaration_str)
yaml_node_definition_str += GenerateNodeDefinition(
fwd_api_name, bwd_api_name, backward_fwd_input_map,
backward_grad_input_map, backward_grad_output_map,
backward_attrs_list)
print("Generated Node Definition: ", node_definition_str)
# Node Definition Generation
definition_declaration_pair = GenerateForwardDefinition(
fwd_api_name, bwd_api_name, forward_inputs_position_map,
forward_outputs_position_map, orig_forward_attrs_list,
backward_fwd_input_map, backward_grad_input_map,
backward_grad_output_map, backward_attrs_list, optional_inputs,
intermediate_outputs, {})
print("Generated Forward Definition: ", forward_definition_str)
print("Generated Forward Declaration: ", forward_declaration_str)
yaml_forward_definition_str += definition_declaration_pair[0]
yaml_forward_declaration_str += definition_declaration_pair[1]
# For python-level API dispatch
CollectCoreOpsInformation(fwd_api_name, forward_inputs_position_map,
forward_outputs_position_map,
orig_forward_attrs_list)
# Inplaced Version Dygraph Function Generation
if fwd_api_name != "sum" and "inplace" in fwd_api.keys():
fwd_api_name_inplaced = GetInplacedFunctionName(fwd_api_name)
# Node Definition Generation
definition_declaration_pair = GenerateForwardDefinition(
fwd_api_name_inplaced, bwd_api_name,
forward_inputs_position_map, forward_outputs_position_map,
forward_attrs_list, backward_fwd_input_map,
backward_grad_input_map, backward_grad_output_map,
backward_attrs_list, optional_inputs, intermediate_outputs,
inplace_map)
print("Generated Inplaced Forward Definition: ",
forward_definition_str)
print("Generated Inplaced Forward Declaration: ",
forward_declaration_str)
forward_definition_str += definition_declaration_pair[0]
forward_declaration_str += definition_declaration_pair[1]
# For python-level API dispatch
CollectCoreOpsInformation(
fwd_api_name_inplaced, forward_inputs_position_map,
forward_outputs_position_map, forward_attrs_list)
if len(namespace) > 0:
forward_definition_str += f"""namespace {namespace} {{
{yaml_forward_definition_str}
}}
"""
forward_declaration_str += f"""namespace {namespace} {{
{yaml_forward_declaration_str}
}}
"""
node_declaration_str += f"""namespace {namespace} {{
{yaml_node_declaration_str}
}}
"""
node_definition_str += f"""namespace {namespace} {{
{yaml_node_definition_str}
}}
"""
else:
forward_definition_str += yaml_forward_definition_str
forward_declaration_str += yaml_forward_declaration_str
node_declaration_str += yaml_node_declaration_str
node_definition_str += yaml_node_definition_str
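The removed main loop above wraps the per-yaml generated strings in a C++ namespace block whenever the yaml file targets a namespace such as "sparse". A minimal Python sketch of that wrapping idea (illustration only; wrap_in_namespace is a hypothetical helper, not part of the generator):

# Wrap a generated C++ snippet in a namespace block when one is in effect.
def wrap_in_namespace(namespace: str, body: str) -> str:
    if namespace:
        return f"namespace {namespace} {{\n{body}\n}}\n"
    return body

print(wrap_in_namespace("sparse", "void FooKernel();"))
# namespace sparse {
# void FooKernel();
# }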
# Generate Files # Generate Files
nodes_h_path = args.nodes_h_path nodes_h_path = args.nodes_h_path
...@@ -1424,12 +1231,6 @@ if __name__ == "__main__": ...@@ -1424,12 +1231,6 @@ if __name__ == "__main__":
forwards_h_path = args.forwards_h_path forwards_h_path = args.forwards_h_path
forwards_cc_path = args.forwards_cc_path forwards_cc_path = args.forwards_cc_path
for path in [
nodes_cc_path, nodes_h_path, forwards_h_path, forwards_cc_path
]:
if os.path.exists(path):
os.remove(path)
GenerateNodeCCFile(nodes_cc_path, node_definition_str) GenerateNodeCCFile(nodes_cc_path, node_definition_str)
GenerateNodeHFile(nodes_h_path, node_declaration_str) GenerateNodeHFile(nodes_h_path, node_declaration_str)
GenerateForwardCCFile(forwards_cc_path, forward_definition_str) GenerateForwardCCFile(forwards_cc_path, forward_definition_str)
......
...@@ -15,7 +15,10 @@ ...@@ -15,7 +15,10 @@
import os import os
import argparse import argparse
import logging import logging
from eager_gen import namespace, yaml_types_mapping, ReadFwdFile, ParseDispensable, IsVectorTensorType, GetForwardFunctionName, ParseYamlForward, DetermineForwardPositionMap, GetInplacedFunctionName, ParseInplaceInfo from codegen_utils import FunctionGeneratorBase, YamlGeneratorBase
from codegen_utils import yaml_types_mapping
from codegen_utils import ReadFwdFile, IsVectorTensorType, GetForwardFunctionName
from codegen_utils import ParseYamlForward, GetInplacedFunctionName
########################### ###########################
## Global Configurations ## ## Global Configurations ##
...@@ -121,7 +124,10 @@ FUNCTION_NAME_TEMPLATE = \ ...@@ -121,7 +124,10 @@ FUNCTION_NAME_TEMPLATE = \
PYTHON_C_FUNCTION_REG_TEMPLATE = \ PYTHON_C_FUNCTION_REG_TEMPLATE = \
"{{\"final_state_{}\", (PyCFunction)(void(*)(void)) {}eager_final_state_api_{}, METH_VARARGS | METH_KEYWORDS, \"C++ interface function for {} in dygraph.\"}}" """
{{\"final_state_{}\", (PyCFunction)(void(*)(void)) {}eager_final_state_api_{}, METH_VARARGS | METH_KEYWORDS, \"C++ interface function for {} in dygraph.\"}}
"""
PYTHON_C_WRAPPER_TEMPLATE = \ PYTHON_C_WRAPPER_TEMPLATE = \
...@@ -229,77 +235,39 @@ NAMESPACE_WRAPPER_TEMPLATE = \ ...@@ -229,77 +235,39 @@ NAMESPACE_WRAPPER_TEMPLATE = \
####################### #######################
## Generator Classes ## ## Generator Classes ##
####################### #######################
class PythonCSingleFunctionGenerator: class PythonCSingleFunctionGenerator(FunctionGeneratorBase):
def __init__(self, fwd_api_contents, namespace): def __init__(self, forward_api_contents, namespace):
self.fwd_api_contents = fwd_api_contents # Members from Parent:
self.namespace = namespace #self.namespace
#self.forward_api_contents
# Raw Contents #self.forward_api_name
self.forward_api_name = "" #self.orig_forward_inputs_list
self.forward_args_str = "" #self.orig_forward_attrs_list
self.forward_returns_str = "" #self.orig_forward_returns_list
#self.forward_inputs_position_map
# Raw Data #self.forward_outputs_position_map
self.forward_attrs_list = None #[ [attr_name, attr_type, default_value, orig_position], ...] #self.optional_inputs
self.forward_inputs_list = None #[ [arg_name, arg_type, orig_position], ...] #self.no_need_buffers
self.forward_returns_list = None #[ [ret_name, ret_type, orig_position], ...] #self.intermediate_outputs
#self.inplace_map
# Processed Data FunctionGeneratorBase.__init__(self, forward_api_contents, namespace)
self.forward_inputs_position_map = None #{ "name" : [type, fwd_position] }
self.forward_outputs_position_map = None #{ "name" : [type, fwd_position] }
# Special Op Attributes
self.optional_inputs = [] #[name, ...]
self.is_forward_only = True self.is_forward_only = True
# Generated Results # Generated Results
self.python_c_function_str = "" self.python_c_function_str = ""
self.python_c_function_reg_str = "" self.python_c_function_reg_str = ""
def CollectRawContents(self):
fwd_api_contents = self.fwd_api_contents
assert 'api' in fwd_api_contents.keys(
), "Unable to find \"api\" in fwd_api_contents keys"
assert 'args' in fwd_api_contents.keys(
), "Unable to find \"args\" in fwd_api_contents keys"
assert 'output' in fwd_api_contents.keys(
), "Unable to find \"output\" in fwd_api_contents keys"
self.forward_api_name = fwd_api_contents['api']
self.forward_args_str = fwd_api_contents['args']
self.forward_returns_str = fwd_api_contents['output']
def CollectIsForwardOnly(self): def CollectIsForwardOnly(self):
fwd_api_contents = self.fwd_api_contents forward_api_contents = self.forward_api_contents
self.is_forward_only = False if 'backward' in fwd_api_contents.keys( self.is_forward_only = False if 'backward' in forward_api_contents.keys(
) else True ) else True
def CollectOptionalInputs(self): def GeneratePythonCFunction(self):
fwd_api_contents = self.fwd_api_contents
if 'optional' in fwd_api_contents.keys():
self.optional_inputs = ParseDispensable(fwd_api_contents[
'optional'])
def CollectForwardInOutAttr(self):
forward_args_str = self.forward_args_str
forward_returns_str = self.forward_returns_str
self.forward_inputs_list, self.forward_attrs_list, self.forward_returns_list = ParseYamlForward(
forward_args_str, forward_returns_str)
def CollectForwardPositionMap(self):
forward_inputs_list = self.forward_inputs_list
forward_returns_list = self.forward_returns_list
self.forward_inputs_position_map, self.forward_outputs_position_map = DetermineForwardPositionMap(
forward_inputs_list, forward_returns_list)
def GeneratePythonCFunction(self, inplace_map):
namespace = self.namespace namespace = self.namespace
forward_api_name = GetInplacedFunctionName( inplace_map = self.inplace_map
self.forward_api_name) if inplace_map else self.forward_api_name forward_api_name = self.forward_api_name
forward_attrs_list = self.forward_attrs_list orig_forward_attrs_list = self.orig_forward_attrs_list
forward_inputs_position_map = self.forward_inputs_position_map forward_inputs_position_map = self.forward_inputs_position_map
forward_outputs_position_map = self.forward_outputs_position_map forward_outputs_position_map = self.forward_outputs_position_map
optional_inputs = self.optional_inputs optional_inputs = self.optional_inputs
...@@ -326,7 +294,7 @@ class PythonCSingleFunctionGenerator: ...@@ -326,7 +294,7 @@ class PythonCSingleFunctionGenerator:
parse_attributes_str = "" parse_attributes_str = ""
# Generate Python-C Attributes Parsing Logic # Generate Python-C Attributes Parsing Logic
for name, atype, _, pos in forward_attrs_list: for name, atype, _, pos in orig_forward_attrs_list:
parsing_function_name = FindParsingFunctionFromAttributeType(atype) parsing_function_name = FindParsingFunctionFromAttributeType(atype)
parse_attributes_str += PARSE_PYTHON_C_ARGS_TEMPLATE.format( parse_attributes_str += PARSE_PYTHON_C_ARGS_TEMPLATE.format(
name, pos, atype, name, parsing_function_name, name, name, pos, atype, name, parsing_function_name, name,
...@@ -334,11 +302,11 @@ class PythonCSingleFunctionGenerator: ...@@ -334,11 +302,11 @@ class PythonCSingleFunctionGenerator:
# Generate Dygraph Function Call Logic # Generate Dygraph Function Call Logic
num_args = len(forward_inputs_position_map.keys()) + len( num_args = len(forward_inputs_position_map.keys()) + len(
forward_attrs_list) orig_forward_attrs_list)
dygraph_function_call_list = ["" for i in range(num_args)] dygraph_function_call_list = ["" for i in range(num_args)]
for name, (_, pos) in forward_inputs_position_map.items(): for name, (_, pos) in forward_inputs_position_map.items():
dygraph_function_call_list[pos] = f"{name}" dygraph_function_call_list[pos] = f"{name}"
for name, _, _, pos in forward_attrs_list: for name, _, _, pos in orig_forward_attrs_list:
dygraph_function_call_list[pos] = f"{name}" dygraph_function_call_list[pos] = f"{name}"
dygraph_function_call_str = ",".join(dygraph_function_call_list) dygraph_function_call_str = ",".join(dygraph_function_call_list)
...@@ -350,17 +318,7 @@ class PythonCSingleFunctionGenerator: ...@@ -350,17 +318,7 @@ class PythonCSingleFunctionGenerator:
fwd_function_name = FUNCTION_NAME_TEMPLATE.format( fwd_function_name = FUNCTION_NAME_TEMPLATE.format(
"::", namespace, GetForwardFunctionName(forward_api_name)) "::", namespace, GetForwardFunctionName(forward_api_name))
if inplace_map: return_str = " return ToPyObject(out);"
assert len(
inplace_map
) == 1, f"size of inplace_map must be 1, but inplace_map of \"{forward_api_name}\" op got {len(inplace_map)}"
for inplace_input, inplace_output in inplace_map.items():
return_str = RETURN_INPLACE_PYOBJECT_TEMPLATE.format(
forward_api_name, inplace_input, forward_api_name,
inplace_output)
break
else:
return_str = " return ToPyObject(out);"
# Generate Record Event for performance profiling # Generate Record Event for performance profiling
pythonc_record_event_str = RECORD_EVENT_TEMPLATE.format( pythonc_record_event_str = RECORD_EVENT_TEMPLATE.format(
...@@ -374,29 +332,56 @@ class PythonCSingleFunctionGenerator: ...@@ -374,29 +332,56 @@ class PythonCSingleFunctionGenerator:
self.python_c_function_reg_str = PYTHON_C_FUNCTION_REG_TEMPLATE.format( self.python_c_function_reg_str = PYTHON_C_FUNCTION_REG_TEMPLATE.format(
forward_api_name, namespace, forward_api_name, forward_api_name) forward_api_name, namespace, forward_api_name, forward_api_name)
def run(self, inplace_map): if len(inplace_map) > 0:
inplaced_forward_api_name = GetInplacedFunctionName(
self.forward_api_name)
assert len(
inplace_map
) == 1, f"size of inplace_map must be 1, but inplace_map of \"{forward_api_name}\" op got {len(inplace_map)}"
for inplace_input, inplace_output in inplace_map.items():
return_str = RETURN_INPLACE_PYOBJECT_TEMPLATE.format(
inplaced_forward_api_name, inplace_input,
inplaced_forward_api_name, inplace_output)
break
self.python_c_function_str += PYTHON_C_FUNCTION_TEMPLATE.format(
inplaced_forward_api_name, pythonc_record_event_str,
inplaced_forward_api_name, get_eager_tensor_str,
parse_attributes_str, fwd_function_name,
dygraph_function_call_str, return_str)
# Generate Python-C Function Registration
self.python_c_function_reg_str += "\n," + PYTHON_C_FUNCTION_REG_TEMPLATE.format(
inplaced_forward_api_name, namespace, inplaced_forward_api_name,
inplaced_forward_api_name)
def run(self):
# Initialized is_forward_only # Initialized is_forward_only
self.CollectIsForwardOnly() self.CollectIsForwardOnly()
# Initialized forward_api_name, forward_args_str, forward_returns_str
self.CollectRawContents()
if SkipAPIGeneration(self.forward_api_name): return False
# Initialized optional_inputs # Initialized optional_inputs
self.CollectOptionalInputs() self.ParseDispensable()
# Initialized inplace_map
self.ParseInplaceInfo()
# Initialized forward_inputs_list, forward_returns_list, forward_attrs_list # Initialized orig_forward_inputs_list, orig_forward_returns_list, orig_forward_attrs_list
self.CollectForwardInOutAttr() self.CollectOriginalForwardInfo()
logging.info( logging.info(
f"Parsed Original Forward Inputs List: \n{self.forward_inputs_list}") f"Parsed Original Forward Inputs List: \n{self.orig_forward_inputs_list}"
)
logging.info( logging.info(
f"Prased Original Forward Attrs List: \n{self.forward_attrs_list}") f"Prased Original Forward Attrs List: \n{self.orig_forward_attrs_list}"
)
logging.info( logging.info(
f"Parsed Original Forward Returns List: \n{self.forward_returns_list}" f"Parsed Original Forward Returns List: \n{self.orig_forward_returns_list}"
) )
if SkipAPIGeneration(self.forward_api_name): return False
# Initialized forward_inputs_position_map, forward_outputs_position_map # Initialized forward_inputs_position_map, forward_outputs_position_map
self.CollectForwardPositionMap() self.DetermineForwardPositionMap(self.orig_forward_inputs_list,
self.orig_forward_returns_list)
logging.info( logging.info(
f"Generated Forward Input Position Map: {self.forward_inputs_position_map}" f"Generated Forward Input Position Map: {self.forward_inputs_position_map}"
) )
...@@ -405,7 +390,7 @@ class PythonCSingleFunctionGenerator: ...@@ -405,7 +390,7 @@ class PythonCSingleFunctionGenerator:
) )
# Code Generation # Code Generation
self.GeneratePythonCFunction(inplace_map) self.GeneratePythonCFunction()
logging.info( logging.info(
f"Generated Python-C Function: {self.python_c_function_str}") f"Generated Python-C Function: {self.python_c_function_str}")
logging.info( logging.info(
...@@ -415,21 +400,18 @@ class PythonCSingleFunctionGenerator: ...@@ -415,21 +400,18 @@ class PythonCSingleFunctionGenerator:
return True return True
class PythonCYamlGenerator: class PythonCYamlGenerator(YamlGeneratorBase):
def __init__(self, path): def __init__(self, path):
self.yaml_path = path # Parent members:
# self.namespace
self.namespace = "" # self.api_yaml_path
self.forward_api_list = [] # self.forward_api_list
YamlGeneratorBase.__init__(self, api_yaml_path)
# Generated Result # Generated Result
self.python_c_functions_reg_str = "" self.python_c_functions_reg_str = ""
self.python_c_functions_str = "" self.python_c_functions_str = ""
def ParseYamlContents(self):
yaml_path = self.yaml_path
self.forward_api_list = ReadFwdFile(yaml_path)
def GeneratePythonCFunctions(self): def GeneratePythonCFunctions(self):
namespace = self.namespace namespace = self.namespace
forward_api_list = self.forward_api_list forward_api_list = self.forward_api_list
...@@ -437,28 +419,12 @@ class PythonCYamlGenerator: ...@@ -437,28 +419,12 @@ class PythonCYamlGenerator:
for forward_api_content in forward_api_list: for forward_api_content in forward_api_list:
f_generator = PythonCSingleFunctionGenerator(forward_api_content, f_generator = PythonCSingleFunctionGenerator(forward_api_content,
namespace) namespace)
status = f_generator.run({}) status = f_generator.run()
if status == True: if status == True:
self.python_c_functions_reg_str += f_generator.python_c_function_reg_str + ",\n" self.python_c_functions_reg_str += f_generator.python_c_function_reg_str + ",\n"
self.python_c_functions_str += f_generator.python_c_function_str + "\n" self.python_c_functions_str += f_generator.python_c_function_str + "\n"
if 'inplace' in forward_api_content.keys():
inplace_map = ParseInplaceInfo(forward_api_content['inplace'])
f_generator_inplace = PythonCSingleFunctionGenerator(
forward_api_content, namespace)
status = f_generator_inplace.run(inplace_map)
if status == True:
self.python_c_functions_reg_str += f_generator_inplace.python_c_function_reg_str + ",\n"
self.python_c_functions_str += f_generator_inplace.python_c_function_str + "\n"
def InferNameSpace(self):
yaml_path = self.yaml_path
if "sparse" in yaml_path:
self.namespace = "sparse::"
def AttachNamespace(self): def AttachNamespace(self):
namespace = self.namespace namespace = self.namespace
python_c_functions_str = self.python_c_functions_str python_c_functions_str = self.python_c_functions_str
...@@ -474,7 +440,7 @@ class PythonCYamlGenerator: ...@@ -474,7 +440,7 @@ class PythonCYamlGenerator:
self.InferNameSpace() self.InferNameSpace()
# Read Yaml file # Read Yaml file
self.ParseYamlContents() self.ParseForwardYamlContents()
# Code Generation # Code Generation
self.GeneratePythonCFunctions() self.GeneratePythonCFunctions()
......
...@@ -51,13 +51,12 @@ static std::vector<std::string> GetTensorsName( ...@@ -51,13 +51,12 @@ static std::vector<std::string> GetTensorsName(
} }
static void CheckInputVarStatus(const Tensor &tensor) { static void CheckInputVarStatus(const Tensor &tensor) {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(tensor.defined() && tensor.is_dense_tensor(), true,
tensor.defined() && phi::DenseTensor::classof(tensor.impl().get()), true, paddle::platform::errors::InvalidArgument(
paddle::platform::errors::InvalidArgument( "The input tensor %s of "
"The input tensor %s of " "RunProgram(Grad)Op holds "
"RunProgram(Grad)Op holds " "wrong type. Expect type is DenseTensor.",
"wrong type. Expect type is DenseTensor.", tensor.name()));
tensor.name()));
PADDLE_ENFORCE_EQ(tensor.initialized(), true, PADDLE_ENFORCE_EQ(tensor.initialized(), true,
paddle::platform::errors::InvalidArgument( paddle::platform::errors::InvalidArgument(
...@@ -74,7 +73,7 @@ static void CheckOutputVarStatus(const paddle::framework::Variable &src_var, ...@@ -74,7 +73,7 @@ static void CheckOutputVarStatus(const paddle::framework::Variable &src_var,
paddle::platform::errors::InvalidArgument( paddle::platform::errors::InvalidArgument(
"dst_tensor shall be defined.")); "dst_tensor shall be defined."));
if (phi::DenseTensor::classof(dst_tensor.impl().get())) { if (dst_tensor.is_dense_tensor()) {
auto &src_tensor = src_var.Get<phi::DenseTensor>(); auto &src_tensor = src_var.Get<phi::DenseTensor>();
PADDLE_ENFORCE_EQ(phi::DenseTensor::classof(&src_tensor), true, PADDLE_ENFORCE_EQ(phi::DenseTensor::classof(&src_tensor), true,
paddle::platform::errors::InvalidArgument( paddle::platform::errors::InvalidArgument(
...@@ -88,7 +87,7 @@ static void CheckOutputVarStatus(const paddle::framework::Variable &src_var, ...@@ -88,7 +87,7 @@ static void CheckOutputVarStatus(const paddle::framework::Variable &src_var,
"RunProgram(Grad)Op's internal " "RunProgram(Grad)Op's internal "
"scope is not initialized.", "scope is not initialized.",
name)); name));
} else if (phi::SelectedRows::classof(dst_tensor.impl().get())) { } else if (dst_tensor.is_selected_rows()) {
auto &src_tensor = src_var.Get<phi::SelectedRows>(); auto &src_tensor = src_var.Get<phi::SelectedRows>();
PADDLE_ENFORCE_EQ(phi::SelectedRows::classof(&src_tensor), true, PADDLE_ENFORCE_EQ(phi::SelectedRows::classof(&src_tensor), true,
paddle::platform::errors::InvalidArgument( paddle::platform::errors::InvalidArgument(
...@@ -159,9 +158,6 @@ static void ShareTensorsFromScope( ...@@ -159,9 +158,6 @@ static void ShareTensorsFromScope(
name)); name));
CheckOutputVarStatus(*var, *tensors[i]); CheckOutputVarStatus(*var, *tensors[i]);
// share tensor // share tensor
// TODO(dev): Determine Tensor type by scope.var
// auto tensor_base = tensors[i]->impl();
// if (phi::DenseTensor::classof(tensor_base.get())) {
if (var->IsType<phi::DenseTensor>()) { if (var->IsType<phi::DenseTensor>()) {
auto &src_tensor = var->Get<phi::DenseTensor>(); auto &src_tensor = var->Get<phi::DenseTensor>();
auto *dst_tensor = const_cast<phi::DenseTensor *>( auto *dst_tensor = const_cast<phi::DenseTensor *>(
...@@ -169,7 +165,6 @@ static void ShareTensorsFromScope( ...@@ -169,7 +165,6 @@ static void ShareTensorsFromScope(
VLOG(2) << "share " << name << " from scope"; VLOG(2) << "share " << name << " from scope";
*dst_tensor = src_tensor; *dst_tensor = src_tensor;
} else if (var->IsType<phi::SelectedRows>()) { } else if (var->IsType<phi::SelectedRows>()) {
// } else if (phi::SelectedRows::classof(tensor_base.get())) {
auto &src_tensor = var->Get<phi::SelectedRows>(); auto &src_tensor = var->Get<phi::SelectedRows>();
auto *dst_tensor = const_cast<phi::SelectedRows *>( auto *dst_tensor = const_cast<phi::SelectedRows *>(
dynamic_cast<const phi::SelectedRows *>(tensors[i]->impl().get())); dynamic_cast<const phi::SelectedRows *>(tensors[i]->impl().get()));
...@@ -202,7 +197,6 @@ inline void RunProgramAPI( ...@@ -202,7 +197,6 @@ inline void RunProgramAPI(
"The OutScope of RunProgramGradOp should only hold one scope.")); "The OutScope of RunProgramGradOp should only hold one scope."));
// Step 2. prepare executor and init persistable variables // Step 2. prepare executor and init persistable variables
// NOTE(Aurelius84): While training some models, forward can be called many // NOTE(Aurelius84): While training some models, forward can be called many
// times and then apply backpropagation all at once, such as Reinforcement // times and then apply backpropagation all at once, such as Reinforcement
// Learning. Tensor data in multi-step training should be saved into single // Learning. Tensor data in multi-step training should be saved into single
...@@ -277,11 +271,6 @@ inline void RunProgramGradAPI( ...@@ -277,11 +271,6 @@ inline void RunProgramGradAPI(
// if all output vars are set to stop_gradient, grad op need not be executed // if all output vars are set to stop_gradient, grad op need not be executed
if (x_grad.empty() && params_grad.empty()) return; if (x_grad.empty() && params_grad.empty()) return;
// TODO(dev): Remove this line hard code. And need to deal with the out_grad
// name problem.
// const_cast<paddle::experimental::Tensor &>(out_grad[0])
// .set_name("matmul_v2_0.tmp_0@GRAD");
auto *global_block = auto *global_block =
BOOST_GET_CONST(paddle::framework::BlockDesc *, attrs.at("global_block")); BOOST_GET_CONST(paddle::framework::BlockDesc *, attrs.at("global_block"));
auto orig_end_op_index = BOOST_GET_CONST(int64_t, attrs.at("end_op_index")); auto orig_end_op_index = BOOST_GET_CONST(int64_t, attrs.at("end_op_index"));
...@@ -381,8 +370,8 @@ class GradNodeRunProgram : public egr::GradNodeBase { ...@@ -381,8 +370,8 @@ class GradNodeRunProgram : public egr::GradNodeBase {
VLOG(3) << "out_grads[0].size() : " << grads[0].size(); VLOG(3) << "out_grads[0].size() : " << grads[0].size();
std::vector<paddle::experimental::Tensor> x_grad; std::vector<paddle::experimental::Tensor> x_grad;
std::vector<paddle::experimental::Tensor> params_grad; std::vector<paddle::experimental::Tensor> params_grad;
ConstructGradTensors(x_, &x_grad); ConstructXGradTensors(x_, &x_grad);
ConstructGradTensors(params_, &params_grad); ConstructParamGradTensors(params_, &params_grad);
std::vector<paddle::experimental::Tensor *> x_grad_ptr; std::vector<paddle::experimental::Tensor *> x_grad_ptr;
std::vector<paddle::experimental::Tensor *> params_grad_ptr; std::vector<paddle::experimental::Tensor *> params_grad_ptr;
for (auto &i : x_grad) { for (auto &i : x_grad) {
...@@ -392,9 +381,6 @@ class GradNodeRunProgram : public egr::GradNodeBase { ...@@ -392,9 +381,6 @@ class GradNodeRunProgram : public egr::GradNodeBase {
params_grad_ptr.emplace_back(&i); params_grad_ptr.emplace_back(&i);
} }
// auto x_grad_ptr = ConstructGradTensors(x_);
// auto params_grad_ptr = ConstructGradTensors(params_);
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
grads[0].size(), fwd_out_names_.size(), grads[0].size(), fwd_out_names_.size(),
paddle::platform::errors::InvalidArgument( paddle::platform::errors::InvalidArgument(
...@@ -412,7 +398,6 @@ class GradNodeRunProgram : public egr::GradNodeBase { ...@@ -412,7 +398,6 @@ class GradNodeRunProgram : public egr::GradNodeBase {
params_grad_ptr); params_grad_ptr);
VLOG(3) << "End Eager Backward Node: GradNodeRunProgram"; VLOG(3) << "End Eager Backward Node: GradNodeRunProgram";
return {x_grad, params_grad}; return {x_grad, params_grad};
// return {x_grad, details::DereferenceTensors(params_grad_ptr)};
} }
void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; } void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; }
...@@ -447,29 +432,35 @@ class GradNodeRunProgram : public egr::GradNodeBase { ...@@ -447,29 +432,35 @@ class GradNodeRunProgram : public egr::GradNodeBase {
} }
protected: protected:
void ConstructGradTensors( void ConstructXGradTensors(
const std::vector<paddle::experimental::Tensor> &fwd_tensors, const std::vector<paddle::experimental::Tensor> &x,
std::vector<paddle::experimental::Tensor> *grad_tensors) { std::vector<paddle::experimental::Tensor> *x_grad) {
// TODO(dev): Need an elegant way to determine information of grad_tensor, // TODO(dev): Need an elegant way to determine information of grad_tensor,
// such as: name, tensor type(DenseTensor or SelectedRows). // such as: name, tensor type(DenseTensor or SelectedRows).
VLOG(3) << "fwd_tensors.size(): " << fwd_tensors.size(); for (auto &t : x) {
for (auto &fwd_t : fwd_tensors) { if (t.is_dense_tensor()) {
if (phi::DenseTensor::classof(fwd_t.impl().get())) { x_grad->emplace_back(std::make_shared<phi::DenseTensor>());
grad_tensors->emplace_back(std::make_shared<phi::DenseTensor>()); } else if (t.is_selected_rows()) {
} else if (phi::SelectedRows::classof(fwd_t.impl().get())) { x_grad->emplace_back(std::make_shared<phi::SelectedRows>());
grad_tensors->emplace_back(std::make_shared<phi::SelectedRows>());
} }
auto &grad_t = grad_tensors->back(); x_grad->back().set_name(t.name() + "@GRAD");
grad_t.set_name(fwd_t.name() + "@GRAD");
} }
} }
void ConstructGradTensors( void ConstructParamGradTensors(
const std::vector<paddle::experimental::Tensor> &fwd_tensors) { const std::vector<paddle::experimental::Tensor> &param,
VLOG(3) << "fwd_tensors.size(): " << fwd_tensors.size(); std::vector<paddle::experimental::Tensor> *param_grad) {
for (auto &fwd_t : fwd_tensors) { for (auto &t : param) {
auto grad_tensor = egr::EagerUtils::unsafe_autograd_meta(fwd_t)->Grad(); auto t_meta = egr::EagerUtils::unsafe_autograd_meta(t);
grad_tensor.set_name(fwd_t.name() + "@GRAD"); auto t_grad = egr::EagerUtils::unsafe_autograd_meta(t)->Grad();
if (t_meta->StopGradient()) {
param_grad->emplace_back();
} else if (t_grad.is_dense_tensor()) {
param_grad->emplace_back(std::make_shared<phi::DenseTensor>());
} else if (t_grad.is_selected_rows()) {
param_grad->emplace_back(std::make_shared<phi::SelectedRows>());
}
param_grad->back().set_name(t.name() + "@GRAD");
} }
} }
......
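The refactor above splits grad-tensor construction: ConstructXGradTensors mirrors each input's storage type (DenseTensor or SelectedRows), while ConstructParamGradTensors additionally leaves an empty slot for parameters whose gradient is stopped; grad slots are named "<name>@GRAD". An illustrative Python sketch of that selection logic (hypothetical records, not the real eager API):

# Build grad slots that mirror each parameter's storage type, skipping
# stop-gradient parameters with an empty placeholder.
params = [
    {"name": "w", "kind": "dense", "stop_gradient": False},
    {"name": "emb", "kind": "selected_rows", "stop_gradient": False},
    {"name": "frozen", "kind": "dense", "stop_gradient": True},
]
param_grad = []
for p in params:
    if p["stop_gradient"]:
        param_grad.append(None)  # placeholder, no gradient is produced
    else:
        param_grad.append({"name": p["name"] + "@GRAD", "kind": p["kind"]})
print(param_grad)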
...@@ -271,6 +271,7 @@ void EagerUtils::GetOutput(const std::shared_ptr<EagerVariable>& out, ...@@ -271,6 +271,7 @@ void EagerUtils::GetOutput(const std::shared_ptr<EagerVariable>& out,
"shared_ptr, this error may indicate some outputs " "shared_ptr, this error may indicate some outputs "
"are nullptr")); "are nullptr"));
out_var->set_impl(out->GetTensorBase()); out_var->set_impl(out->GetTensorBase());
out_var->set_name(out->name());
} }
void EagerUtils::GetOutputs( void EagerUtils::GetOutputs(
......
...@@ -13,6 +13,9 @@ IF(WITH_GPU) ...@@ -13,6 +13,9 @@ IF(WITH_GPU)
nv_library(graph_gpu_ps SRCS graph_gpu_ps_table.h DEPS heter_comm table) nv_library(graph_gpu_ps SRCS graph_gpu_ps_table.h DEPS heter_comm table)
nv_test(test_graph_comm SRCS test_graph.cu DEPS graph_gpu_ps) nv_test(test_graph_comm SRCS test_graph.cu DEPS graph_gpu_ps)
nv_test(test_cpu_graph_sample SRCS test_cpu_graph_sample.cu DEPS graph_gpu_ps) nv_test(test_cpu_graph_sample SRCS test_cpu_graph_sample.cu DEPS graph_gpu_ps)
#nv_test(test_sample_rate SRCS test_sample_rate.cu DEPS graph_gpu_ps)
# ADD_EXECUTABLE(test_sample_rate test_sample_rate.cu)
# target_link_libraries(test_sample_rate graph_gpu_ps)
ENDIF() ENDIF()
IF(WITH_ROCM) IF(WITH_ROCM)
hip_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h DEPS cub device_context) hip_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h DEPS cub device_context)
......
...@@ -93,14 +93,17 @@ node_list[8]-> node_id:17, neighbor_size:1, neighbor_offset:15 ...@@ -93,14 +93,17 @@ node_list[8]-> node_id:17, neighbor_size:1, neighbor_offset:15
struct NeighborSampleResult { struct NeighborSampleResult {
int64_t *val; int64_t *val;
int *actual_sample_size, sample_size, key_size; int *actual_sample_size, sample_size, key_size;
int *offset;
NeighborSampleResult(int _sample_size, int _key_size) NeighborSampleResult(int _sample_size, int _key_size)
: sample_size(_sample_size), key_size(_key_size) { : sample_size(_sample_size), key_size(_key_size) {
actual_sample_size = NULL; actual_sample_size = NULL;
val = NULL; val = NULL;
offset = NULL;
}; };
~NeighborSampleResult() { ~NeighborSampleResult() {
if (val != NULL) cudaFree(val); if (val != NULL) cudaFree(val);
if (actual_sample_size != NULL) cudaFree(actual_sample_size); if (actual_sample_size != NULL) cudaFree(actual_sample_size);
if (offset != NULL) cudaFree(offset);
} }
}; };
......
...@@ -71,10 +71,10 @@ TEST(TEST_FLEET, graph_sample) { ...@@ -71,10 +71,10 @@ TEST(TEST_FLEET, graph_sample) {
*/ */
::paddle::distributed::GraphParameter table_proto; ::paddle::distributed::GraphParameter table_proto;
table_proto.set_gpups_mode(true); table_proto.set_gpups_mode(true);
table_proto.set_gpups_mode_shard_num(127); table_proto.set_shard_num(127);
table_proto.set_gpu_num(3); table_proto.set_gpu_num(3);
table_proto.set_gpups_graph_sample_class("BasicBfsGraphSampler"); table_proto.set_gpups_graph_sample_class("BasicBfsGraphSampler");
table_proto.set_gpups_graph_sample_args("5,5,1,1"); table_proto.set_gpups_graph_sample_args("100,5,5,1,1");
prepare_file(edge_file_name, edges); prepare_file(edge_file_name, edges);
g.init_cpu_table(table_proto); g.init_cpu_table(table_proto);
g.load(std::string(edge_file_name), std::string("e>")); g.load(std::string(edge_file_name), std::string("e>"));
...@@ -93,16 +93,53 @@ TEST(TEST_FLEET, graph_sample) { ...@@ -93,16 +93,53 @@ TEST(TEST_FLEET, graph_sample) {
cudaMalloc((void **)&key, 3 * sizeof(int64_t)); cudaMalloc((void **)&key, 3 * sizeof(int64_t));
cudaMemcpy(key, cpu_key, 3 * sizeof(int64_t), cudaMemcpyHostToDevice); cudaMemcpy(key, cpu_key, 3 * sizeof(int64_t), cudaMemcpyHostToDevice);
auto neighbor_sample_res = g.graph_neighbor_sample(0, (int64_t *)key, 3, 3); auto neighbor_sample_res = g.graph_neighbor_sample(0, (int64_t *)key, 3, 3);
int64_t *res = new int64_t[9]; int64_t *res = new int64_t[7];
cudaMemcpy(res, neighbor_sample_res->val, 72, cudaMemcpyDeviceToHost); /*
cudaMemcpy(res, neighbor_sample_res->val, 56, cudaMemcpyDeviceToHost);
std::sort(res, res + 3); std::sort(res, res + 3);
std::sort(res + 6, res + 9); std::sort(res + 4, res + 7);
int64_t expected_sample_val[] = {28, 29, 30, 0, -1, -1, 21, 22, 23}; //int64_t expected_sample_val[] = {28, 29, 30, 0, -1, -1, 21, 22, 23};
for (int i = 0; i < 9; i++) { int64_t expected_sample_val[] = {28, 29, 30, 0, 21, 22, 23};
for (int i = 0; i < 7; i++) {
VLOG(0)<<i<<" "<<res[i];
if (expected_sample_val[i] != -1) { if (expected_sample_val[i] != -1) {
ASSERT_EQ(res[i], expected_sample_val[i]); ASSERT_EQ(res[i], expected_sample_val[i]);
} }
} }
delete[] res; delete[] res;
delete neighbor_sample_res; delete neighbor_sample_res;
*/
cudaMemcpy(res, neighbor_sample_res->val, 56, cudaMemcpyDeviceToHost);
int *actual_sample_size = new int[3];
cudaMemcpy(actual_sample_size, neighbor_sample_res->actual_sample_size, 12,
cudaMemcpyDeviceToHost); // 3, 1, 3
int *cumsum_sample_size = new int[3];
cudaMemcpy(cumsum_sample_size, neighbor_sample_res->offset, 12,
cudaMemcpyDeviceToHost); // 0, 3, 4
std::vector<std::vector<int64_t>> neighbors_;
std::vector<int64_t> neighbors_7 = {28, 29, 30, 31, 32, 33, 34, 35};
std::vector<int64_t> neighbors_0 = {0};
std::vector<int64_t> neighbors_6 = {21, 22, 23, 24, 25, 26, 27};
neighbors_.push_back(neighbors_7);
neighbors_.push_back(neighbors_0);
neighbors_.push_back(neighbors_6);
for (int i = 0; i < 3; i++) {
for (int j = cumsum_sample_size[i];
j < cumsum_sample_size[i] + actual_sample_size[i]; j++) {
bool flag = false;
for (int k = 0; k < neighbors_[i].size(); k++) {
if (res[j] == neighbors_[i][k]) {
flag = true;
break;
}
}
ASSERT_EQ(flag, true);
}
}
delete[] res;
delete[] actual_sample_size;
delete[] cumsum_sample_size;
delete neighbor_sample_res;
} }
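The second half of the updated test reads the flat sampling result through three buffers: val (all sampled neighbor ids back to back), actual_sample_size (neighbors returned per key) and offset (where each key's slice starts in val). A Python sketch with hypothetical values consistent with the inline comments above (sizes 3, 1, 3 and offsets 0, 3, 4):

# Recover per-key neighbor lists from a flat value buffer plus offsets/counts.
val = [28, 30, 33, 0, 21, 25, 26]   # flat neighbor ids for the three keys
offset = [0, 3, 4]                  # start index of each key's slice in val
actual_sample_size = [3, 1, 3]      # neighbors actually returned per key

per_key = [
    val[offset[i]: offset[i] + actual_sample_size[i]]
    for i in range(len(offset))
]
print(per_key)  # [[28, 30, 33], [0], [21, 25, 26]]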
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <unistd.h>
#include <condition_variable> // NOLINT
#include <fstream>
#include <iomanip>
#include <string>
#include <thread> // NOLINT
#include <unordered_set>
#include <vector>
#include "google/protobuf/text_format.h"
#include <chrono>
#include "gtest/gtest.h"
#include "paddle/fluid/distributed/ps.pb.h"
#include "paddle/fluid/distributed/ps/service/env.h"
#include "paddle/fluid/distributed/ps/service/sendrecv.pb.h"
#include "paddle/fluid/distributed/ps/table/common_graph_table.h"
#include "paddle/fluid/distributed/ps/table/graph/graph_node.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h"
#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h"
#include "paddle/fluid/framework/fleet/heter_ps/heter_comm.h"
#include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h"
#include "paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
using namespace paddle::framework;
namespace platform = paddle::platform;
namespace operators = paddle::operators;
namespace memory = paddle::memory;
namespace distributed = paddle::distributed;
std::string input_file;
int fixed_key_size = 100, sample_size = 100,
bfs_sample_nodes_in_each_shard = 10000, init_search_size = 1,
bfs_sample_edges = 20;
std::vector<std::string> edges = {
std::string("37\t45\t0.34"), std::string("37\t145\t0.31"),
std::string("37\t112\t0.21"), std::string("96\t48\t1.4"),
std::string("96\t247\t0.31"), std::string("96\t111\t1.21"),
std::string("59\t45\t0.34"), std::string("59\t145\t0.31"),
std::string("59\t122\t0.21"), std::string("97\t48\t0.34"),
std::string("97\t247\t0.31"), std::string("97\t111\t0.21")};
// odd id:96 48 122 112
char edge_file_name[] = "test_edges.txt";
void prepare_file(char file_name[], std::vector<std::string> data) {
std::ofstream ofile;
ofile.open(file_name);
for (auto x : data) {
ofile << x << std::endl;
}
ofile.close();
}
void testSampleRate() {
#ifdef PADDLE_WITH_HETERPS
std::vector<int64_t> ids;
int start = 0;
pthread_rwlock_t rwlock;
pthread_rwlock_init(&rwlock, NULL);
{
::paddle::distributed::GraphParameter table_proto;
table_proto.set_gpups_mode(false);
table_proto.set_shard_num(127);
table_proto.set_task_pool_size(24);
std::cerr << "initializing begin";
distributed::GraphTable graph_table;
graph_table.initialize(table_proto);
std::cerr << "initializing done";
graph_table.load(input_file, std::string("e>"));
int sample_actual_size = -1;
int step = fixed_key_size, cur = 0;
while (sample_actual_size != 0) {
std::unique_ptr<char[]> buffer;
graph_table.pull_graph_list(cur, step, buffer, sample_actual_size, false,
1);
int index = 0;
while (index < sample_actual_size) {
paddle::distributed::FeatureNode node;
node.recover_from_buffer(buffer.get() + index);
index += node.get_size(false);
// res.push_back(node);
ids.push_back(node.get_id());
int swap_pos = rand() % ids.size();
std::swap(ids[swap_pos], ids[(int)ids.size() - 1]);
}
cur = ids.size();
// if (sample_actual_size == 0) break;
// char *buff = buffer.get();
// for (int i = 0; i < sample_actual_size/sizeof(int64_t); i++) {
// ids.push_back(*((int64_t *)buff + i));
// int swap_pos = rand() % ids.size();
// std::swap(ids[swap_pos], ids[(int)ids.size() - 1]);
// }
// cur += sample_actual_size/sizeof(int64_t);
}
std::cerr << "load ids done" << std::endl;
std::vector<int64_t> sample_id[10], sample_neighbors[10];
std::vector<int> actual_size[10];
auto func = [&rwlock, &graph_table, &ids, &sample_id, &actual_size,
&sample_neighbors, &start](int i) {
while (true) {
int s, sn;
bool exit = false;
pthread_rwlock_wrlock(&rwlock);
if (start < ids.size()) {
s = start;
sn = ids.size() - start;
sn = min(sn, fixed_key_size);
start += sn;
} else {
exit = true;
}
pthread_rwlock_unlock(&rwlock);
if (exit) break;
std::vector<std::shared_ptr<char>> buffers(sn);
std::vector<int> ac(sn);
auto status = graph_table.random_sample_neighbors(
ids.data() + s, sample_size, buffers, ac, false);
for (int j = s; j < s + sn; j++) {
sample_id[i].push_back(ids[j]);
actual_size[i].push_back(ac[j - s] / sizeof(int64_t));
int ss = ac[j - s] / sizeof(int64_t);
for (int k = 0; k < ss; k++) {
sample_neighbors[i].push_back(
*((int64_t *)(buffers[j - s].get() + k * sizeof(int64_t))));
}
}
}
VLOG(0) << "func " << i << " returns ";
};
auto start1 = std::chrono::steady_clock::now();
std::thread thr[10];
for (int i = 0; i < 10; i++) {
thr[i] = std::thread(func, i);
}
for (int i = 0; i < 10; i++) thr[i].join();
auto end1 = std::chrono::steady_clock::now();
auto tt =
std::chrono::duration_cast<std::chrono::microseconds>(end1 - start1);
std::cerr << "total time cost without cache is " << tt.count() << " us"
<< std::endl;
}
const int gpu_num = 8;
::paddle::distributed::GraphParameter table_proto;
table_proto.set_gpups_mode(true);
table_proto.set_shard_num(127);
table_proto.set_gpu_num(gpu_num);
table_proto.set_gpups_graph_sample_class("BasicBfsGraphSampler");
table_proto.set_gpups_graph_sample_args(std::to_string(init_search_size) +
",100000000,10000000,1,1");
std::vector<int> dev_ids;
for (int i = 0; i < gpu_num; i++) {
dev_ids.push_back(i);
}
std::shared_ptr<HeterPsResource> resource =
std::make_shared<HeterPsResource>(dev_ids);
resource->enable_p2p();
GpuPsGraphTable g(resource);
g.init_cpu_table(table_proto);
g.load(std::string(input_file), std::string("e>"));
NodeQueryResult *query_node_res;
query_node_res = g.query_node_list(0, 0, ids.size() + 10000);
VLOG(0) << "gpu got " << query_node_res->actual_sample_size << " nodes ";
VLOG(0) << "cpu got " << ids.size() << " nodes";
ASSERT_EQ((int)query_node_res->actual_sample_size, (int)ids.size());
int64_t *gpu_node_res = new int64_t[ids.size()];
cudaMemcpy(gpu_node_res, query_node_res->val, ids.size() * sizeof(int64_t),
cudaMemcpyDeviceToHost);
std::unordered_set<int64_t> cpu_node_set, gpu_node_set;
for (auto x : ids) {
cpu_node_set.insert(x);
}
for (int i = 0; i < (int)query_node_res->actual_sample_size; i++) {
auto x = gpu_node_res[i];
ASSERT_EQ(cpu_node_set.find(x) != cpu_node_set.end(), true);
gpu_node_set.insert(x);
}
VLOG(0) << " cpu_node_size = " << cpu_node_set.size();
VLOG(0) << " gpu_node_size = " << gpu_node_set.size();
ASSERT_EQ(cpu_node_set.size(), gpu_node_set.size());
for (int i = 0; i < 20; i++) {
int st = ids.size() / 20 * i;
auto q = g.query_node_list(0, st, ids.size() / 20);
VLOG(0) << " the " << i << "th iteration size = " << q->actual_sample_size;
}
// NodeQueryResult *query_node_list(int gpu_id, int start, int query_size);
/*
void *key;
cudaMalloc((void **)&key, ids.size() * sizeof(int64_t));
cudaMemcpy(key, ids.data(), ids.size() * sizeof(int64_t),
cudaMemcpyHostToDevice);
std::vector<NeighborSampleResult *> res[gpu_num];
start = 0;
auto func = [&rwlock, &g, &res, &start,
&gpu_num, &ids, &key](int i) {
while (true) {
int s, sn;
bool exit = false;
pthread_rwlock_wrlock(&rwlock);
if (start < ids.size()) {
s = start;
sn = ids.size() - start;
sn = min(sn, fixed_key_size);
start += sn;
} else {
exit = true;
}
pthread_rwlock_unlock(&rwlock);
if (exit) break;
auto r =
g.graph_neighbor_sample(i, (int64_t *)(key + s), sample_size, sn);
res[i].push_back(r);
}
};
auto start1 = std::chrono::steady_clock::now();
std::thread thr[gpu_num];
for (int i = 0; i < gpu_num; i++) {
thr[i] = std::thread(func, i);
}
for (int i = 0; i < gpu_num; i++) thr[i].join();
auto end1 = std::chrono::steady_clock::now();
auto tt =
std::chrono::duration_cast<std::chrono::microseconds>(end1 - start1);
std::cerr << "total time cost without cache is " << tt.count() << " us"
<< std::endl;
*/
#endif
}
// TEST(testSampleRate, Run) { testSampleRate(); }
int main(int argc, char *argv[]) {
for (int i = 0; i < argc; i++)
VLOG(0) << "Argument " << i << " is " << std::string(argv[i]);
if (argc > 1) {
input_file = argv[1];
} else {
prepare_file(edge_file_name, edges);
input_file = edge_file_name;
}
VLOG(0) << "input_file is " << input_file;
if (argc > 2) {
fixed_key_size = std::stoi(argv[2]);
}
VLOG(0) << "sample_node_size for every batch is " << fixed_key_size;
if (argc > 3) {
sample_size = std::stoi(argv[3]);
}
VLOG(0) << "sample_size neighbor_size is " << sample_size;
if (argc > 4) init_search_size = std::stoi(argv[4]);
VLOG(0) << " init_search_size " << init_search_size;
testSampleRate();
}
...@@ -24,7 +24,7 @@ ...@@ -24,7 +24,7 @@
#include "paddle/fluid/framework/parallel_executor.h" #include "paddle/fluid/framework/parallel_executor.h"
#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/program_desc.h"
USE_OP(mul); USE_OP_ITSELF(mul);
USE_OP(cinn_launch); USE_OP(cinn_launch);
USE_OP_ITSELF(elementwise_add); USE_OP_ITSELF(elementwise_add);
namespace paddle::framework { namespace paddle::framework {
......
...@@ -234,10 +234,26 @@ void InterpreterCore::Convert( ...@@ -234,10 +234,26 @@ void InterpreterCore::Convert(
gc_check_input_list.erase(last, gc_check_input_list.end()); gc_check_input_list.erase(last, gc_check_input_list.end());
for (auto var_id : gc_check_input_list) { for (auto var_id : gc_check_input_list) {
vec_meta_info[var_id].var_ref_count_++; paddle::framework::Variable* var = global_scope_->Var(var_id);
instr.AddGCCheckVar(var_id); if (var->IsType<LoDTensor>() || var->IsType<phi::SelectedRows>() ||
VLOG(4) << "clear " << global_scope_->GetNameById(var_id) << " after " var->IsType<LoDTensorArray>()) {
<< instr.OpBase()->Type(); vec_meta_info[var_id].var_ref_count_++;
// TODO(zhiqiu): not all vars need to be checked; a var only needs to be
// checked after its last_live_op. For example,
// b = op1(a)
// c = op2(a, b)
// in this case, a is the input of op1 and op2, we only need to check
// a after op2, because op2 always uses a after op1.
instr.AddGCCheckVar(var_id);
VLOG(4) << "clear " << global_scope_->GetNameById(var_id) << " after "
<< instr.OpBase()->Type();
} else {
VLOG(4) << "not clear " << global_scope_->GetNameById(var_id)
<< " after " << instr.OpBase()->Type()
<< " because its type is "
<< framework::ToTypeName(var->Type());
}
} }
} }
......
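The interpreter change above only registers GC checks for tensor-like variables (LoDTensor, SelectedRows, LoDTensorArray) and bumps a per-variable reference count for each instruction that reads it; the TODO notes that a variable really only needs checking after its last reader. A small Python sketch of that reference-counting idea (hypothetical op and variable names):

# Count how many ops read each tensor-like variable, then decrement as ops
# run; a variable can be freed once its count reaches zero (after op2 for "a").
from collections import Counter

ops = [("op1", ["a"]), ("op2", ["a", "b"])]  # (op name, input variables)
tensor_like = {"a", "b"}                     # variables eligible for GC

ref_count = Counter(v for _, ins in ops for v in ins if v in tensor_like)
for op, ins in ops:
    for v in ins:
        if v in tensor_like:
            ref_count[v] -= 1
            if ref_count[v] == 0:
                print(f"free {v} after {op}")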
...@@ -674,7 +674,7 @@ TEST(BuildCinnPassTest, NoNeedBufferInput) { ...@@ -674,7 +674,7 @@ TEST(BuildCinnPassTest, NoNeedBufferInput) {
} // namespace paddle } // namespace paddle
USE_PASS(build_cinn_pass); USE_PASS(build_cinn_pass);
USE_OP(mul); USE_OP_ITSELF(mul);
USE_OP_ITSELF(relu); USE_OP_ITSELF(relu);
USE_OP_ITSELF(elementwise_add); USE_OP_ITSELF(elementwise_add);
USE_OP_ITSELF(relu_grad); USE_OP_ITSELF(relu_grad);
......
...@@ -300,6 +300,6 @@ TEST(CinnCompilerTest, Compile) { ...@@ -300,6 +300,6 @@ TEST(CinnCompilerTest, Compile) {
USE_PASS(build_cinn_pass); USE_PASS(build_cinn_pass);
USE_PASS(graph_viz_pass); USE_PASS(graph_viz_pass);
USE_OP(mul); USE_OP_ITSELF(mul);
USE_OP_ITSELF(relu); USE_OP_ITSELF(relu);
USE_OP_ITSELF(elementwise_add); USE_OP_ITSELF(elementwise_add);
...@@ -98,4 +98,4 @@ TEST(test_var_helper, eager_var_helper) { ...@@ -98,4 +98,4 @@ TEST(test_var_helper, eager_var_helper) {
} // namespace imperative } // namespace imperative
} // namespace paddle } // namespace paddle
USE_OP(mul); USE_OP_ITSELF(mul);
...@@ -28,6 +28,8 @@ ...@@ -28,6 +28,8 @@
PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(matmul_with_flatten, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(matmul_with_flatten_grad, CPU, ALL_LAYOUT);
namespace platform = paddle::platform; namespace platform = paddle::platform;
namespace framework = paddle::framework; namespace framework = paddle::framework;
...@@ -267,7 +269,7 @@ TEST(TestHooks, TestGradVarLeafBackwardHookWithSortedGradAccmulated) { ...@@ -267,7 +269,7 @@ TEST(TestHooks, TestGradVarLeafBackwardHookWithSortedGradAccmulated) {
} // namespace imperative } // namespace imperative
} // namespace paddle } // namespace paddle
USE_OP(mul); USE_OP_ITSELF(mul);
USE_OP(mul_grad); USE_OP_ITSELF(mul_grad);
USE_OP_ITSELF(elementwise_add); USE_OP_ITSELF(elementwise_add);
USE_OP_ITSELF(elementwise_add_grad); USE_OP_ITSELF(elementwise_add_grad);
...@@ -416,4 +416,4 @@ TEST(test_layer, test_eager) { ...@@ -416,4 +416,4 @@ TEST(test_layer, test_eager) {
} // namespace imperative } // namespace imperative
} // namespace paddle } // namespace paddle
USE_OP(mul); USE_OP_ITSELF(mul);
...@@ -34,9 +34,13 @@ PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); ...@@ -34,9 +34,13 @@ PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sum, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(sum, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sum_grad, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(sum_grad, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(matmul_with_flatten, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(matmul_with_flatten_grad, CPU, ALL_LAYOUT);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sum_grad, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(sum_grad, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(matmul_with_flatten, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(matmul_with_flatten_grad, GPU, ALL_LAYOUT);
#endif #endif
namespace imperative = paddle::imperative; namespace imperative = paddle::imperative;
...@@ -598,8 +602,8 @@ TEST(test_tracer, eager_tracer) { ...@@ -598,8 +602,8 @@ TEST(test_tracer, eager_tracer) {
} // namespace imperative } // namespace imperative
} // namespace paddle } // namespace paddle
USE_OP(mul); USE_OP_ITSELF(mul);
USE_OP(mul_grad); USE_OP_ITSELF(mul_grad);
USE_OP_ITSELF(reduce_sum); USE_OP_ITSELF(reduce_sum);
USE_OP_ITSELF(reduce_sum_grad); USE_OP_ITSELF(reduce_sum_grad);
USE_OP_ITSELF(elementwise_add); USE_OP_ITSELF(elementwise_add);
...@@ -43,4 +43,4 @@ TEST(fc_op, test) { ...@@ -43,4 +43,4 @@ TEST(fc_op, test) {
} // namespace tensorrt } // namespace tensorrt
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
USE_OP(mul); USE_OP_ITSELF(mul);
...@@ -46,4 +46,4 @@ TEST(MulOpConverter, main) { ...@@ -46,4 +46,4 @@ TEST(MulOpConverter, main) {
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
USE_OP(mul); USE_OP_ITSELF(mul);
...@@ -65,9 +65,10 @@ class MeanCUDAKernel : public framework::OpKernel<T> { ...@@ -65,9 +65,10 @@ class MeanCUDAKernel : public framework::OpKernel<T> {
for (decltype(rank) i = 0; i < rank; ++i) { for (decltype(rank) i = 0; i < rank; ++i) {
reduce_dims.push_back(i); reduce_dims.push_back(i);
} }
TensorReduceImpl<T, T, kernel_primitives::AddFunctor, Div>( TensorReduceImpl<T, T, kernel_primitives::AddFunctor,
context.cuda_device_context(), *input, output, Div(numel), reduce_dims, kps::IdentityFunctor<T>>(
stream); context.cuda_device_context(), *input, output,
kps::IdentityFunctor<T>(), reduce_dims, stream, true);
} }
}; };
......
...@@ -14,7 +14,7 @@ limitations under the License. */ ...@@ -14,7 +14,7 @@ limitations under the License. */
#include <string> #include <string>
#include "paddle/fluid/operators/mul_op.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/mkldnn_reuse.h" #include "paddle/fluid/platform/mkldnn_reuse.h"
namespace phi { namespace phi {
...@@ -46,6 +46,9 @@ using dnnl::memory; ...@@ -46,6 +46,9 @@ using dnnl::memory;
using dnnl::prop_kind; using dnnl::prop_kind;
using dnnl::stream; using dnnl::stream;
constexpr int kMULMKLDNNINT8 = 1;
constexpr int kMULMKLDNNFP32 = 2;
template <typename XT, typename YT, typename OT> template <typename XT, typename YT, typename OT>
class MulPrimitiveFactory { class MulPrimitiveFactory {
public: public:
......
...@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/mul_op.h"
#include <memory> #include <memory>
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <vector> #include <vector>
#include "paddle/fluid/framework/op_registry.h"
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/platform/mkldnn_helper.h"
#endif #endif
...@@ -27,6 +27,9 @@ namespace operators { ...@@ -27,6 +27,9 @@ namespace operators {
using framework::OpKernelType; using framework::OpKernelType;
using framework::Tensor; using framework::Tensor;
constexpr int kMULMKLDNNINT8 = 1;
constexpr int kMULMKLDNNFP32 = 2;
class MulOp : public framework::OperatorWithKernel { class MulOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
...@@ -354,16 +357,3 @@ REGISTER_OPERATOR(mul_grad, ops::MulGradOp, ...@@ -354,16 +357,3 @@ REGISTER_OPERATOR(mul_grad, ops::MulGradOp,
ops::MulDoubleGradMaker<paddle::imperative::OpBase>); ops::MulDoubleGradMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(mul_grad_grad, ops::MulDoubleGradOp); REGISTER_OPERATOR(mul_grad_grad, ops::MulDoubleGradOp);
REGISTER_OP_CPU_KERNEL(
mul, ops::MulKernel<paddle::platform::CPUDeviceContext, float>,
ops::MulKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
mul_grad, ops::MulGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::MulGradKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
mul_grad_grad,
ops::MulDoubleGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::MulDoubleGradKernel<paddle::platform::CPUDeviceContext, double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/mul_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(mul, ops::MulKernel<plat::CUDADeviceContext, float>,
ops::MulKernel<plat::CUDADeviceContext, double>,
ops::MulKernel<plat::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL(
mul_grad, ops::MulGradKernel<plat::CUDADeviceContext, float>,
ops::MulGradKernel<plat::CUDADeviceContext, double>,
ops::MulGradKernel<plat::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL(
mul_grad_grad,
ops::MulDoubleGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::MulDoubleGradKernel<paddle::platform::CUDADeviceContext, double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
constexpr int kMULMKLDNNINT8 = 1;
constexpr int kMULMKLDNNFP32 = 2;
template <typename DeviceContext, typename T>
class MulKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const Tensor* x = context.Input<Tensor>("X");
const Tensor* y = context.Input<Tensor>("Y");
Tensor* z = context.Output<Tensor>("Out");
const Tensor x_matrix =
x->dims().size() > 2
? framework::ReshapeToMatrix(
*x, context.template Attr<int>("x_num_col_dims"))
: *x;
const Tensor y_matrix =
y->dims().size() > 2
? framework::ReshapeToMatrix(
*y, context.template Attr<int>("y_num_col_dims"))
: *y;
z->mutable_data<T>(context.GetPlace());
auto z_dim = z->dims();
if (z_dim.size() != 2) {
z->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
}
auto blas = phi::funcs::GetBlas<DeviceContext, T>(context);
blas.MatMul(x_matrix, y_matrix, z);
if (z_dim.size() != 2) {
z->Resize(z_dim);
}
}
};
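// Illustration only (not part of this header): ReshapeToMatrix collapses the
// leading x_num_col_dims dimensions of a tensor into rows and the remaining
// dimensions into columns, following the same rule as phi::flatten_to_2d used
// below. A minimal standalone sketch of that rule (hypothetical helper,
// standard library only):
//
//   std::pair<int64_t, int64_t> FlattenTo2D(const std::vector<int64_t>& dims,
//                                           size_t num_col_dims) {
//     int64_t rows = 1, cols = 1;
//     for (size_t i = 0; i < num_col_dims; ++i) rows *= dims[i];
//     for (size_t i = num_col_dims; i < dims.size(); ++i) cols *= dims[i];
//     return {rows, cols};  // e.g. {2, 3, 4, 5} with num_col_dims = 2 -> {6, 20}
//   }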
template <typename DeviceContext, typename T>
class MulGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
int x_num_col_dims = ctx.template Attr<int>("x_num_col_dims");
int y_num_col_dims = ctx.template Attr<int>("y_num_col_dims");
auto* x = ctx.Input<framework::LoDTensor>("X");
auto* y = ctx.Input<framework::LoDTensor>("Y");
auto x_matrix = x->dims().size() > 2
? framework::ReshapeToMatrix(*x, x_num_col_dims)
: static_cast<const Tensor&>(*x);
auto y_matrix = y->dims().size() > 2
? framework::ReshapeToMatrix(*y, y_num_col_dims)
: static_cast<const Tensor&>(*y);
auto* dout = ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"));
Tensor dout_mat;
dout_mat.ShareDataWith(*dout);
dout_mat.Resize({phi::flatten_to_2d(x->dims(), x_num_col_dims)[0],
phi::flatten_to_2d(y->dims(), y_num_col_dims)[1]});
auto* dx = ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<framework::LoDTensor>(framework::GradVarName("Y"));
if (dx != nullptr) {
dx->set_lod(x->lod());
}
if (dy != nullptr) {
dy->set_lod(y->lod());
}
auto& dev_ctx = ctx.template device_context<DeviceContext>();
auto blas = phi::funcs::GetBlas<DeviceContext, T>(dev_ctx);
if (dx) {
dx->mutable_data<T>(ctx.GetPlace());
Tensor dx_matrix = dx->dims().size() > 2
? framework::ReshapeToMatrix(*dx, x_num_col_dims)
: *dx;
// dx = dout * y'. dx: M x K, dout : M x N, y : K x N
blas.MatMul(dout_mat, false, y_matrix, true, &dx_matrix);
}
if (dy) {
dy->mutable_data<T>(ctx.GetPlace());
Tensor dy_matrix = dy->dims().size() > 2
? framework::ReshapeToMatrix(*dy, y_num_col_dims)
: *dy;
// dy = x' * dout. dy K x N, dout : M x N, x : M x K
blas.MatMul(x_matrix, true, dout_mat, false, &dy_matrix);
}
}
};
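// Illustration only: the shape algebra behind the two MatMul calls above.
// With x_matrix : M x K, y_matrix : K x N and dout_mat : M x N,
//   dX = dOut * Y^T  ->  (M x N) * (N x K) = M x K
//   dY = X^T * dOut  ->  (K x M) * (M x N) = K x N
// which is why the dx call transposes its second operand and the dy call
// transposes its first operand.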
template <typename DeviceContext, typename T>
class MulDoubleGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
int x_num_col_dims = ctx.template Attr<int>("x_num_col_dims");
int y_num_col_dims = ctx.template Attr<int>("y_num_col_dims");
auto* x = ctx.Input<framework::LoDTensor>("X");
auto* y = ctx.Input<framework::LoDTensor>("Y");
auto x_mat = x->dims().size() > 2
? framework::ReshapeToMatrix(*x, x_num_col_dims)
: static_cast<const Tensor&>(*x);
auto y_mat = y->dims().size() > 2
? framework::ReshapeToMatrix(*y, y_num_col_dims)
: static_cast<const Tensor&>(*y);
const int m = phi::flatten_to_2d(x->dims(), x_num_col_dims)[0];
const int n = phi::flatten_to_2d(y->dims(), y_num_col_dims)[1];
auto* dout = ctx.Input<framework::LoDTensor>("DOut");
Tensor dout_mat;
dout_mat.ShareDataWith(*dout);
dout_mat.Resize({m, n});
auto* ddx = ctx.Input<framework::LoDTensor>("DDX");
auto* ddy = ctx.Input<framework::LoDTensor>("DDY");
auto* dx = ctx.Output<framework::LoDTensor>("DX");
auto* dy = ctx.Output<framework::LoDTensor>("DY");
auto* ddout = ctx.Output<framework::LoDTensor>("DDOut");
Tensor ddout_mat;
if (ddout) {
ddout->set_lod(dout->lod());
// allocate and reshape ddout
ddout->mutable_data<T>(ctx.GetPlace());
ddout_mat.ShareDataWith(*ddout);
ddout_mat.Resize({m, n});
}
auto& dev_ctx = ctx.template device_context<DeviceContext>();
auto blas = phi::funcs::GetBlas<DeviceContext, T>(dev_ctx);
// A flag specifying whether ddout has already been written: if the flag is
// false, MatMul's beta should be 0 so ddout is overwritten; if it is true,
// beta should be 1 so the new result is accumulated into ddout.
bool ddout_flag = false;
if (ddx) {
auto ddx_mat = ddx->dims().size() > 2
? framework::ReshapeToMatrix(*ddx, x_num_col_dims)
: static_cast<const Tensor&>(*ddx);
// dy = ddx' * dout. dy : K x N, ddx' : K x M, dout : M x N
if (dy) {
dy->set_lod(y->lod());
// allocate and reshape dy
dy->mutable_data<T>(ctx.GetPlace());
Tensor dy_mat = dy->dims().size() > 2
? framework::ReshapeToMatrix(*dy, y_num_col_dims)
: *dy;
blas.MatMul(ddx_mat, true, dout_mat, false, &dy_mat);
}
// ddout1 = ddx * y. ddx : M x K, y : K x N, ddout1 : M x N
if (ddout) {
blas.MatMul(ddx_mat, false, y_mat, false, static_cast<T>(1.0),
&ddout_mat, static_cast<T>(ddout_flag));
ddout_flag = true;
}
}
if (ddy) {
auto ddy_mat = ddy->dims().size() > 2
? framework::ReshapeToMatrix(*ddy, y_num_col_dims)
: static_cast<const Tensor&>(*ddy);
// dx = dout * ddy'. dout : M x N, ddy' : N x K, dx : M x K
if (dx) {
dx->set_lod(x->lod());
// allocate and reshape dx
dx->mutable_data<T>(ctx.GetPlace());
Tensor dx_mat = dx->dims().size() > 2
? framework::ReshapeToMatrix(*dx, x_num_col_dims)
: *dx;
blas.MatMul(dout_mat, false, ddy_mat, true, &dx_mat);
}
// ddout2 = x * ddy. x : M x K, ddy : K x N, ddout2 : M x N
if (ddout) {
blas.MatMul(x_mat, false, ddy_mat, false, static_cast<T>(1.0),
&ddout_mat, static_cast<T>(ddout_flag));
}
}
}
};
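// Illustration only: ddout_flag maps directly onto the GEMM beta parameter,
// C = alpha * A * B + beta * C. The first contribution is written with
// beta = 0 (overwrite) and the second with beta = 1 (accumulate), so when both
// DDX and DDY are given the kernel computes
//   ddout = ddx * y + x * ddy
// without zero-initializing ddout or allocating a temporary buffer.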
} // namespace operators
} // namespace paddle
...@@ -15,7 +15,7 @@ limitations under the License. */ ...@@ -15,7 +15,7 @@ limitations under the License. */
#include <memory> #include <memory>
#include <string> #include <string>
#include "paddle/fluid/operators/mul_op.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle { namespace paddle {
......
...@@ -14,11 +14,11 @@ limitations under the License. */ ...@@ -14,11 +14,11 @@ limitations under the License. */
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
#include "paddle/fluid/operators/mul_op.h"
#include <memory> #include <memory>
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <vector> #include <vector>
#include "paddle/fluid/framework/op_registry.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -14,8 +14,13 @@ limitations under the License. */ ...@@ -14,8 +14,13 @@ limitations under the License. */
#include <memory> #include <memory>
#include <vector> #include <vector>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/multiary.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -25,44 +30,6 @@ class MultiplexOp : public framework::OperatorWithKernel { ...@@ -25,44 +30,6 @@ class MultiplexOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("Ids"), "Input", "Ids", "Multiplex");
PADDLE_ENFORCE_NE(
ctx->Inputs("X").empty(), true,
platform::errors::InvalidArgument("MultiInput(X) shouldn't be empty."));
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Multiplex");
auto ids_dim = ctx->GetInputDim("Ids");
PADDLE_ENFORCE_EQ(
ids_dim.size(), 2,
platform::errors::PreconditionNotMet(
"The index tensor must be a vector with 2 dimensions"));
PADDLE_ENFORCE_EQ(
ids_dim[1], 1,
platform::errors::PreconditionNotMet(
"The index tensor must be a vector with batchSize x 1."));
auto ins_dims = ctx->GetInputsDim("X");
auto num_ins = ins_dims.size();
PADDLE_ENFORCE_GT(num_ins, 1,
platform::errors::InvalidArgument(
"multiplex operator should have more than "
"one candidate input tensors."));
auto in_dim = ins_dims[0];
PADDLE_ENFORCE_GE(
in_dim.size(), 2,
platform::errors::InvalidArgument(
"The rank of candidate tensors must be not less than 2."));
for (size_t i = 1; i < num_ins; i++) {
auto dim = ins_dims[i];
PADDLE_ENFORCE_EQ(
in_dim, dim,
platform::errors::PreconditionNotMet(
"All the candidate tensors must have the same size."));
}
ctx->SetOutputDim("Out", in_dim);
}
protected: protected:
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
...@@ -164,8 +131,11 @@ class MultiplexGradMaker : public framework::SingleGradOpMaker<T> { ...@@ -164,8 +131,11 @@ class MultiplexGradMaker : public framework::SingleGradOpMaker<T> {
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(multiplex, MultiplexInferShapeFunctor,
PD_INFER_META(phi::MultiplexInferMeta));
REGISTER_OPERATOR(multiplex, ops::MultiplexOp, ops::MultiplexOpMaker, REGISTER_OPERATOR(multiplex, ops::MultiplexOp, ops::MultiplexOpMaker,
ops::MultiplexGradMaker<paddle::framework::OpDesc>, ops::MultiplexGradMaker<paddle::framework::OpDesc>,
ops::MultiplexGradMaker<paddle::imperative::OpBase>); ops::MultiplexGradMaker<paddle::imperative::OpBase>,
MultiplexInferShapeFunctor);
REGISTER_OPERATOR(multiplex_grad, ops::MultiplexGradOp); REGISTER_OPERATOR(multiplex_grad, ops::MultiplexGradOp);
...@@ -21,6 +21,10 @@ ...@@ -21,6 +21,10 @@
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/platform/mkldnn_helper.h"
#endif #endif
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/unary.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -29,43 +33,6 @@ using DDim = framework::DDim; ...@@ -29,43 +33,6 @@ using DDim = framework::DDim;
class QrOp : public framework::OperatorWithKernel { class QrOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "qr");
OP_INOUT_CHECK(ctx->HasOutput("Q"), "Output", "Q", "qr");
OP_INOUT_CHECK(ctx->HasOutput("R"), "Output", "R", "qr");
auto x_dims = ctx->GetInputDim("X");
int x_rank = x_dims.size();
PADDLE_ENFORCE_GE(x_dims.size(), 2,
platform::errors::InvalidArgument(
"the rank of input must greater than 2"));
bool compute_q;
bool reduced_mode;
int m = x_dims[x_rank - 2];
int n = x_dims[x_rank - 1];
int min_mn = std::min(m, n);
std::string mode = ctx->Attrs().Get<std::string>("mode");
std::tie(compute_q, reduced_mode) = _parse_qr_mode(mode);
if (compute_q) {
int k = reduced_mode ? min_mn : m;
auto q_dims_vec = phi::vectorize(x_dims);
q_dims_vec[q_dims_vec.size() - 1] = k;
ctx->SetOutputDim("Q", phi::make_ddim(q_dims_vec));
} else {
ctx->SetOutputDim("Q", phi::make_ddim({0}));
}
int k = reduced_mode ? min_mn : m;
auto r_dims_vec = phi::vectorize(x_dims);
r_dims_vec[r_dims_vec.size() - 2] = k;
r_dims_vec[r_dims_vec.size() - 1] = n;
ctx->SetOutputDim("R", phi::make_ddim(r_dims_vec));
ctx->ShareLoD("X", /*->*/ "Q");
ctx->ShareLoD("X", /*->*/ "R");
}
}; };
class QrOpMaker : public framework::OpProtoAndCheckerMaker { class QrOpMaker : public framework::OpProtoAndCheckerMaker {
...@@ -83,10 +50,8 @@ class QrOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -83,10 +50,8 @@ class QrOpMaker : public framework::OpProtoAndCheckerMaker {
.SetDefault("reduced"); .SetDefault("reduced");
AddComment(R"DOC( AddComment(R"DOC(
Qr Operator. Qr Operator.
This operator is used to perform QR operation for batched matrices $X$. This operator is used to perform QR operation for batched matrices $X$.
$$Q, R = qr(X)$$ $$Q, R = qr(X)$$
)DOC"); )DOC");
} }
}; };
...@@ -138,10 +103,13 @@ class QrGradMaker : public framework::SingleGradOpMaker<T> { ...@@ -138,10 +103,13 @@ class QrGradMaker : public framework::SingleGradOpMaker<T> {
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(qr, QrInferShapeFunctor,
PD_INFER_META(phi::QrInferMeta));
REGISTER_OPERATOR(qr, ops::QrOp, ops::QrOpMaker, REGISTER_OPERATOR(qr, ops::QrOp, ops::QrOpMaker,
ops::QrGradMaker<paddle::framework::OpDesc>, ops::QrGradMaker<paddle::framework::OpDesc>,
ops::QrGradMaker<paddle::imperative::OpBase>); ops::QrGradMaker<paddle::imperative::OpBase>,
QrInferShapeFunctor);
REGISTER_OPERATOR(qr_grad, ops::QrGradOp); REGISTER_OPERATOR(qr_grad, ops::QrGradOp);
......
...@@ -33,12 +33,12 @@ void TensorReduceImpl(const platform::CUDADeviceContext& dev_ctx, ...@@ -33,12 +33,12 @@ void TensorReduceImpl(const platform::CUDADeviceContext& dev_ctx,
const framework::Tensor& x, framework::Tensor* y, const framework::Tensor& x, framework::Tensor* y,
const TransformOp& transform, const TransformOp& transform,
const std::vector<int>& origin_reduce_dims, const std::vector<int>& origin_reduce_dims,
gpuStream_t stream) { gpuStream_t stream, bool is_mean = false) {
y->mutable_data<Ty>(x.place()); y->mutable_data<Ty>(x.place());
phi::funcs::ReduceKernel<Tx, Ty, ReduceOp, TransformOp>( phi::funcs::ReduceKernel<Tx, Ty, ReduceOp, TransformOp>(
static_cast<const phi::GPUContext&>(dev_ctx), x, y, transform, static_cast<const phi::GPUContext&>(dev_ctx), x, y, transform,
origin_reduce_dims); origin_reduce_dims, is_mean);
} }
} // namespace operators } // namespace operators
......
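// A minimal CPU-side sketch (illustration only, not the CUDA kernel above) of
// what the new is_mean flag folds into the reduction: presumably it lets
// reduce_mean reuse the same kernel by dividing the reduced sum by the number
// of reduced elements inside the reduction instead of in a separate pass.
#include <vector>

inline double ReduceSketch(const std::vector<double>& x, bool is_mean) {
  double acc = 0.0;
  for (double v : x) acc += v;  // the reduction itself
  if (is_mean && !x.empty()) {
    acc /= static_cast<double>(x.size());  // the folded mean step
  }
  return acc;
}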
...@@ -13,29 +13,18 @@ See the License for the specific language governing permissions and ...@@ -13,29 +13,18 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <memory> #include <memory>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/unary.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
class TrilTriuOp : public framework::OperatorWithKernel { class TrilTriuOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE_EQ(
ctx->HasInput("X"), true,
platform::errors::NotFound("Input(X) of TrilTriuOp is not found."));
PADDLE_ENFORCE_EQ(
ctx->HasOutput("Out"), true,
platform::errors::NotFound("Output(Out) of TrilTriuOp is not found."));
const auto& x_dims = ctx->GetInputDim("X");
PADDLE_ENFORCE_GE(x_dims.size(), 2,
platform::errors::InvalidArgument(
"Input(X)'s rank must be at least 2 in TrilTriuOp."));
ctx->SetOutputDim("Out", x_dims);
ctx->ShareLoD("X", /*->*/ "Out");
}
}; };
class TrilTriuOpMaker : public framework::OpProtoAndCheckerMaker { class TrilTriuOpMaker : public framework::OpProtoAndCheckerMaker {
...@@ -100,7 +89,10 @@ class TrilTriuGradOpMaker : public framework::SingleGradOpMaker<T> { ...@@ -100,7 +89,10 @@ class TrilTriuGradOpMaker : public framework::SingleGradOpMaker<T> {
namespace ops = paddle::operators; namespace ops = paddle::operators;
namespace plat = paddle::platform; namespace plat = paddle::platform;
DECLARE_INFER_SHAPE_FUNCTOR(tril_triu, TrilTriuInferShapeFunctor,
PD_INFER_META(phi::TrilTriuInferMeta));
REGISTER_OPERATOR(tril_triu, ops::TrilTriuOp, ops::TrilTriuOpMaker, REGISTER_OPERATOR(tril_triu, ops::TrilTriuOp, ops::TrilTriuOpMaker,
ops::TrilTriuGradOpMaker<paddle::framework::OpDesc>, ops::TrilTriuGradOpMaker<paddle::framework::OpDesc>,
ops::TrilTriuGradOpMaker<paddle::imperative::OpBase>); ops::TrilTriuGradOpMaker<paddle::imperative::OpBase>,
TrilTriuInferShapeFunctor);
REGISTER_OPERATOR(tril_triu_grad, ops::TrilTriuGradOp); REGISTER_OPERATOR(tril_triu_grad, ops::TrilTriuGradOp);
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#pragma once #pragma once
#include <iostream> #include <iostream>
#include "paddle/phi/core/enforce.h"
static PyObject *eager_api_run_program(PyObject *self, PyObject *args, static PyObject *eager_api_run_program(PyObject *self, PyObject *args,
PyObject *kwargs) { PyObject *kwargs) {
...@@ -33,13 +34,24 @@ static PyObject *eager_api_run_program(PyObject *self, PyObject *args, ...@@ -33,13 +34,24 @@ static PyObject *eager_api_run_program(PyObject *self, PyObject *args,
run_program_dygraph_function(X, Params, Out, OutScope, DOut, attrs); run_program_dygraph_function(X, Params, Out, OutScope, DOut, attrs);
PyEval_RestoreThread(tstate); PyEval_RestoreThread(tstate);
tstate = nullptr; tstate = nullptr;
Py_RETURN_NONE;
} catch (paddle::platform::EnforceNotMet &exception) {
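// Restore the Python thread state, then append the failing operator's context
// to the error message and rethrow the exception to Python.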
if (tstate) {
PyEval_RestoreThread(tstate);
}
std::ostringstream sout;
sout << exception.what();
sout << " [operator < run_program > error]";
exception.set_error_str(sout.str());
ThrowExceptionToPython(std::current_exception());
return nullptr;
} catch (...) { } catch (...) {
if (tstate) { if (tstate) {
PyEval_RestoreThread(tstate); PyEval_RestoreThread(tstate);
} }
ThrowExceptionToPython(std::current_exception()); ThrowExceptionToPython(std::current_exception());
return nullptr;
} }
Py_RETURN_NONE;
} }
static PyMethodDef CustomEagerFinalStateMethods[] = { static PyMethodDef CustomEagerFinalStateMethods[] = {
......
...@@ -40,6 +40,9 @@ limitations under the License. */ ...@@ -40,6 +40,9 @@ limitations under the License. */
#include "paddle/phi/common/data_type.h" #include "paddle/phi/common/data_type.h"
#include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/sparse_coo_tensor.h"
#include "paddle/phi/core/sparse_csr_tensor.h"
namespace paddle { namespace paddle {
namespace pybind { namespace pybind {
...@@ -468,6 +471,90 @@ static PyObject* eager_api_run_costum_op(PyObject* self, PyObject* args, ...@@ -468,6 +471,90 @@ static PyObject* eager_api_run_costum_op(PyObject* self, PyObject* args,
EAGER_CATCH_AND_THROW_RETURN_NULL EAGER_CATCH_AND_THROW_RETURN_NULL
} }
static PyObject* eager_api_sparse_coo_tensor(PyObject* self, PyObject* args,
PyObject* kwargs) {
EAGER_TRY
auto non_zero_indices = CastPyArg2Tensor(PyTuple_GET_ITEM(args, 0), 0);
auto non_zero_elements = CastPyArg2Tensor(PyTuple_GET_ITEM(args, 1), 1);
auto dense_shape = CastPyArg2VectorOfInt(PyTuple_GET_ITEM(args, 2), 2);
auto stop_gradient = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 3), 3);
PADDLE_ENFORCE(non_zero_indices.is_dense_tensor(),
paddle::platform::errors::Fatal(
"the non-zero indices must be a DenseTensor."));
PADDLE_ENFORCE(non_zero_elements.is_dense_tensor(),
paddle::platform::errors::Fatal(
"the non-zero elements must be a DenseTensor."));
auto dense_indices =
std::dynamic_pointer_cast<phi::DenseTensor>(non_zero_indices.impl());
auto dense_elements =
std::dynamic_pointer_cast<phi::DenseTensor>(non_zero_elements.impl());
// TODO(zhangkaihuo): After creating the SparseCooTensor, call coalesced() to
// sort and merge duplicate indices.
std::shared_ptr<phi::SparseCooTensor> coo_tensor =
std::make_shared<phi::SparseCooTensor>(*dense_indices, *dense_elements,
phi::make_ddim(dense_shape));
paddle::experimental::Tensor tensor;
tensor.set_impl(coo_tensor);
auto name =
egr::Controller::Instance().GenerateUniqueName("generated_tensor");
tensor.set_name(name);
auto autograd_meta = egr::EagerUtils::autograd_meta(&tensor);
autograd_meta->SetStopGradient(static_cast<bool>(stop_gradient));
if (!autograd_meta->GetMutableGradNode()) {
VLOG(3) << "Tensor(" << name
<< ") have not GradNode, add GradNodeAccumulation for it.";
autograd_meta->SetGradNode(
std::make_shared<egr::GradNodeAccumulation>(autograd_meta));
}
return ToPyObject(tensor);
EAGER_CATCH_AND_THROW_RETURN_NULL
}
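// Illustration only: in the usual COO layout assumed here, non_zero_indices is
// an integer DenseTensor of shape [sparse_dim, nnz] and non_zero_elements holds
// the nnz values. For the dense matrix [[0, 1], [2, 0]] that would be
// indices = [[0, 1], [1, 0]], elements = [1, 2], dense_shape = [2, 2].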
static PyObject* eager_api_sparse_csr_tensor(PyObject* self, PyObject* args,
PyObject* kwargs) {
EAGER_TRY
auto non_zero_crows = CastPyArg2Tensor(PyTuple_GET_ITEM(args, 0), 0);
auto non_zero_cols = CastPyArg2Tensor(PyTuple_GET_ITEM(args, 1), 1);
auto non_zero_elements = CastPyArg2Tensor(PyTuple_GET_ITEM(args, 2), 2);
auto dense_shape = CastPyArg2VectorOfInt(PyTuple_GET_ITEM(args, 3), 3);
auto stop_gradient = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 4), 4);
PADDLE_ENFORCE(non_zero_crows.is_dense_tensor(),
paddle::platform::errors::Fatal(
"the compressed non-zero rows must be a DenseTensor."));
PADDLE_ENFORCE(non_zero_cols.is_dense_tensor(),
paddle::platform::errors::Fatal(
"the non-zero cols must be a DenseTensor."));
PADDLE_ENFORCE(non_zero_elements.is_dense_tensor(),
paddle::platform::errors::Fatal(
"the non-zero elements must be a DenseTensor."));
auto dense_crows =
std::dynamic_pointer_cast<phi::DenseTensor>(non_zero_crows.impl());
auto dense_cols =
std::dynamic_pointer_cast<phi::DenseTensor>(non_zero_cols.impl());
auto dense_elements =
std::dynamic_pointer_cast<phi::DenseTensor>(non_zero_elements.impl());
std::shared_ptr<phi::SparseCsrTensor> csr_tensor =
std::make_shared<phi::SparseCsrTensor>(*dense_crows, *dense_cols,
*dense_elements,
phi::make_ddim(dense_shape));
paddle::experimental::Tensor tensor;
tensor.set_impl(csr_tensor);
auto name =
egr::Controller::Instance().GenerateUniqueName("generated_tensor");
tensor.set_name(name);
auto autograd_meta = egr::EagerUtils::autograd_meta(&tensor);
autograd_meta->SetStopGradient(static_cast<bool>(stop_gradient));
if (!autograd_meta->GetMutableGradNode()) {
VLOG(3) << "Tensor(" << name
<< ") have not GradNode, add GradNodeAccumulation for it.";
autograd_meta->SetGradNode(
std::make_shared<egr::GradNodeAccumulation>(autograd_meta));
}
return ToPyObject(tensor);
EAGER_CATCH_AND_THROW_RETURN_NULL
}
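// Illustration only: the crows/cols/values layout the binding above expects,
// in the usual CSR convention (an assumption here; no Paddle types involved).
// crows has rows + 1 entries, and crows[i + 1] - crows[i] is the number of
// non-zeros in row i. A standalone sketch that builds such a layout:
//
//   struct CsrSketch {
//     std::vector<int64_t> crows;   // compressed row offsets, size rows + 1
//     std::vector<int64_t> cols;    // column index of each non-zero
//     std::vector<float> values;    // the non-zero values themselves
//   };
//
//   CsrSketch DenseToCsr(const std::vector<std::vector<float>>& dense) {
//     CsrSketch out;
//     out.crows.push_back(0);
//     for (const auto& row : dense) {
//       for (int64_t j = 0; j < static_cast<int64_t>(row.size()); ++j) {
//         if (row[j] != 0.0f) {
//           out.cols.push_back(j);
//           out.values.push_back(row[j]);
//         }
//       }
//       out.crows.push_back(static_cast<int64_t>(out.values.size()));
//     }
//     return out;
//   }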
PyMethodDef variable_functions[] = { PyMethodDef variable_functions[] = {
// TODO(jiabin): Remove scale when we have final state tests // TODO(jiabin): Remove scale when we have final state tests
{"scale", (PyCFunction)(void (*)(void))eager_api_scale, {"scale", (PyCFunction)(void (*)(void))eager_api_scale,
...@@ -490,6 +577,14 @@ PyMethodDef variable_functions[] = { ...@@ -490,6 +577,14 @@ PyMethodDef variable_functions[] = {
{"read_next_tensor_list", {"read_next_tensor_list",
(PyCFunction)(void (*)(void))eager_api_read_next_tensor_list, (PyCFunction)(void (*)(void))eager_api_read_next_tensor_list,
METH_VARARGS | METH_KEYWORDS, NULL}, METH_VARARGS | METH_KEYWORDS, NULL},
/**sparse functions**/
{"sparse_coo_tensor",
(PyCFunction)(void (*)(void))eager_api_sparse_coo_tensor,
METH_VARARGS | METH_KEYWORDS, NULL},
{"sparse_csr_tensor",
(PyCFunction)(void (*)(void))eager_api_sparse_csr_tensor,
METH_VARARGS | METH_KEYWORDS, NULL},
/**sparse functions**/
{NULL, NULL, 0, NULL}}; {NULL, NULL, 0, NULL}};
void BindFunctions(PyObject* module) { void BindFunctions(PyObject* module) {
......
...@@ -959,11 +959,11 @@ static PyObject* tensor__set_grad_type(TensorObject* self, PyObject* args, ...@@ -959,11 +959,11 @@ static PyObject* tensor__set_grad_type(TensorObject* self, PyObject* args,
EAGER_TRY EAGER_TRY
auto var_type = pybind::CastPyArg2ProtoType(PyTuple_GET_ITEM(args, 0), 0); auto var_type = pybind::CastPyArg2ProtoType(PyTuple_GET_ITEM(args, 0), 0);
auto grad_tensor = auto grad_tensor =
egr::EagerUtils::unsafe_autograd_meta(self->tensor)->Grad(); egr::EagerUtils::unsafe_autograd_meta(self->tensor)->MutableGrad();
if (var_type == framework::proto::VarType::LOD_TENSOR) { if (var_type == framework::proto::VarType::LOD_TENSOR) {
grad_tensor.set_impl(std::make_shared<phi::DenseTensor>()); grad_tensor->set_impl(std::make_shared<phi::DenseTensor>());
} else if (var_type == framework::proto::VarType::SELECTED_ROWS) { } else if (var_type == framework::proto::VarType::SELECTED_ROWS) {
grad_tensor.set_impl(std::make_shared<phi::SelectedRows>()); grad_tensor->set_impl(std::make_shared<phi::SelectedRows>());
} }
return Py_None; return Py_None;
EAGER_CATCH_AND_THROW_RETURN_NULL EAGER_CATCH_AND_THROW_RETURN_NULL
...@@ -1097,6 +1097,49 @@ static PyObject* tensor_method_is_sparse_csr(TensorObject* self, PyObject* args, ...@@ -1097,6 +1097,49 @@ static PyObject* tensor_method_is_sparse_csr(TensorObject* self, PyObject* args,
EAGER_CATCH_AND_THROW_RETURN_NULL EAGER_CATCH_AND_THROW_RETURN_NULL
} }
static PyObject* tensor_method_to_sparse_coo(TensorObject* self, PyObject* args,
PyObject* kwargs) {
EAGER_TRY
int64_t sparse_dim = CastPyArg2AttrLong(PyTuple_GET_ITEM(args, 0), 0);
auto coo_tensor = self->tensor.to_sparse_coo(sparse_dim);
egr::EagerUtils::autograd_meta(&coo_tensor)
->SetStopGradient(
egr::EagerUtils::autograd_meta(&self->tensor)->StopGradient());
egr::EagerUtils::autograd_meta(&coo_tensor)
->SetPersistable(
egr::EagerUtils::autograd_meta(&(self->tensor))->Persistable());
return ToPyObject(coo_tensor);
EAGER_CATCH_AND_THROW_RETURN_NULL
}
static PyObject* tensor_method_to_sparse_csr(TensorObject* self, PyObject* args,
PyObject* kwargs) {
EAGER_TRY
auto csr_tensor = self->tensor.to_sparse_csr();
egr::EagerUtils::autograd_meta(&csr_tensor)
->SetStopGradient(
egr::EagerUtils::autograd_meta(&self->tensor)->StopGradient());
egr::EagerUtils::autograd_meta(&csr_tensor)
->SetPersistable(
egr::EagerUtils::autograd_meta(&(self->tensor))->Persistable());
return ToPyObject(csr_tensor);
EAGER_CATCH_AND_THROW_RETURN_NULL
}
static PyObject* tensor_method_to_dense(TensorObject* self, PyObject* args,
PyObject* kwargs) {
EAGER_TRY
auto dense_tensor = self->tensor.to_dense();
egr::EagerUtils::autograd_meta(&dense_tensor)
->SetStopGradient(
egr::EagerUtils::autograd_meta(&self->tensor)->StopGradient());
egr::EagerUtils::autograd_meta(&dense_tensor)
->SetPersistable(
egr::EagerUtils::autograd_meta(&(self->tensor))->Persistable());
return ToPyObject(dense_tensor);
EAGER_CATCH_AND_THROW_RETURN_NULL
}
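// Note: all three conversion methods above copy stop_gradient and persistable
// from the source tensor, so autograd behavior is preserved across the
// dense/sparse format change.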
static PyObject* tensor__inplace_version(TensorObject* self, PyObject* args, static PyObject* tensor__inplace_version(TensorObject* self, PyObject* args,
PyObject* kwargs) { PyObject* kwargs) {
EAGER_TRY EAGER_TRY
...@@ -1185,6 +1228,12 @@ PyMethodDef variable_methods[] = { ...@@ -1185,6 +1228,12 @@ PyMethodDef variable_methods[] = {
METH_VARARGS | METH_KEYWORDS, NULL}, METH_VARARGS | METH_KEYWORDS, NULL},
{"is_sparse_csr", (PyCFunction)(void (*)(void))tensor_method_is_sparse_csr, {"is_sparse_csr", (PyCFunction)(void (*)(void))tensor_method_is_sparse_csr,
METH_VARARGS | METH_KEYWORDS, NULL}, METH_VARARGS | METH_KEYWORDS, NULL},
{"to_sparse_coo", (PyCFunction)(void (*)(void))tensor_method_to_sparse_coo,
METH_VARARGS | METH_KEYWORDS, NULL},
{"to_sparse_csr", (PyCFunction)(void (*)(void))tensor_method_to_sparse_csr,
METH_VARARGS | METH_KEYWORDS, NULL},
{"to_dense", (PyCFunction)(void (*)(void))tensor_method_to_dense,
METH_VARARGS | METH_KEYWORDS, NULL},
/***the method of sparse tensor****/ /***the method of sparse tensor****/
{"_inplace_version", (PyCFunction)(void (*)(void))tensor__inplace_version, {"_inplace_version", (PyCFunction)(void (*)(void))tensor__inplace_version,
METH_VARARGS | METH_KEYWORDS, NULL}, METH_VARARGS | METH_KEYWORDS, NULL},
......
...@@ -33,19 +33,21 @@ namespace tensorrt { ...@@ -33,19 +33,21 @@ namespace tensorrt {
static nvinfer1::IBuilder* createInferBuilder( static nvinfer1::IBuilder* createInferBuilder(
nvinfer1::ILogger& logger) { // NOLINT nvinfer1::ILogger& logger) { // NOLINT
return static_cast<nvinfer1::IBuilder*>( return static_cast<nvinfer1::IBuilder*>(
phi::dynload::createInferBuilder_INTERNAL(&logger, NV_TENSORRT_VERSION)); ::phi::dynload::createInferBuilder_INTERNAL(&logger,
NV_TENSORRT_VERSION));
} }
static nvinfer1::IRuntime* createInferRuntime( static nvinfer1::IRuntime* createInferRuntime(
nvinfer1::ILogger& logger) { // NOLINT nvinfer1::ILogger& logger) { // NOLINT
return static_cast<nvinfer1::IRuntime*>( return static_cast<nvinfer1::IRuntime*>(
phi::dynload::createInferRuntime_INTERNAL(&logger, NV_TENSORRT_VERSION)); ::phi::dynload::createInferRuntime_INTERNAL(&logger,
NV_TENSORRT_VERSION));
} }
TrtEngine::TrtEngine(int device_id) : device_id_(device_id) { TrtEngine::TrtEngine(int device_id) : device_id_(device_id) {
FreshDeviceId(); FreshDeviceId();
logger_.reset(new TrtLogger()); logger_.reset(new TrtLogger());
builder_.reset(createInferBuilder(logger_->GetTrtLogger())); builder_.reset(createInferBuilder(logger_->GetTrtLogger()));
phi::dynload::initLibNvInferPlugins(&logger_->GetTrtLogger(), ""); ::phi::dynload::initLibNvInferPlugins(&logger_->GetTrtLogger(), "");
} }
nvinfer1::IBuilder* TrtEngine::GetTrtBuilder() { nvinfer1::IBuilder* TrtEngine::GetTrtBuilder() {
...@@ -237,11 +239,11 @@ bool TrtEngine::SetupNetworkAndConfig(const BuildOptions& build, ...@@ -237,11 +239,11 @@ bool TrtEngine::SetupNetworkAndConfig(const BuildOptions& build,
} }
void TrtEngine::PrepareOutputHandle(const std::string& out_name) { void TrtEngine::PrepareOutputHandle(const std::string& out_name) {
phi::DenseTensor t; ::phi::DenseTensor t;
outputs_.emplace(out_name, t); outputs_.emplace(out_name, t);
} }
phi::DenseTensor* TrtEngine::GetOutput(const std::string& name) { ::phi::DenseTensor* TrtEngine::GetOutput(const std::string& name) {
return &outputs_[name]; return &outputs_[name];
} }
...@@ -249,7 +251,7 @@ size_t TrtEngine::GetOutputNum() const { return outputs_.size(); } ...@@ -249,7 +251,7 @@ size_t TrtEngine::GetOutputNum() const { return outputs_.size(); }
bool TrtEngine::SetUpInference( bool TrtEngine::SetUpInference(
const InferenceOptions& inference, const InferenceOptions& inference,
const std::unordered_map<std::string, phi::DenseTensor*>& inputs) { const std::unordered_map<std::string, ::phi::DenseTensor*>& inputs) {
// TODO(wilber): now only create one exec_context // TODO(wilber): now only create one exec_context
FreshDeviceId(); FreshDeviceId();
CHECK(engine_ != nullptr); CHECK(engine_ != nullptr);
...@@ -272,7 +274,7 @@ bool TrtEngine::SetUpInference( ...@@ -272,7 +274,7 @@ bool TrtEngine::SetUpInference(
return true; return true;
} }
void TrtEngine::Run(const phi::GPUContext& ctx) { void TrtEngine::Run(const ::phi::GPUContext& ctx) {
if (is_dynamic_shape_) { if (is_dynamic_shape_) {
DynamicRun(ctx); DynamicRun(ctx);
} else { } else {
...@@ -280,7 +282,7 @@ void TrtEngine::Run(const phi::GPUContext& ctx) { ...@@ -280,7 +282,7 @@ void TrtEngine::Run(const phi::GPUContext& ctx) {
} }
} }
void TrtEngine::StaticRun(const phi::GPUContext& ctx) { void TrtEngine::StaticRun(const ::phi::GPUContext& ctx) {
const int num_bindings = engine_->getNbBindings(); const int num_bindings = engine_->getNbBindings();
std::vector<void*> buffers(num_bindings, nullptr); std::vector<void*> buffers(num_bindings, nullptr);
...@@ -291,7 +293,8 @@ void TrtEngine::StaticRun(const phi::GPUContext& ctx) { ...@@ -291,7 +293,8 @@ void TrtEngine::StaticRun(const phi::GPUContext& ctx) {
buffers[bind_index] = buffers[bind_index] =
const_cast<void*>(static_cast<const void*>(bind.buffer->data<float>())); const_cast<void*>(static_cast<const void*>(bind.buffer->data<float>()));
if (runtime_batch != -1) { if (runtime_batch != -1) {
CHECK_EQ(runtime_batch, phi::vectorize<int64_t>(bind.buffer->dims())[0]); CHECK_EQ(runtime_batch,
::phi::vectorize<int64_t>(bind.buffer->dims())[0]);
} }
runtime_batch = bind.buffer->dims()[0]; runtime_batch = bind.buffer->dims()[0];
} }
...@@ -306,7 +309,7 @@ void TrtEngine::StaticRun(const phi::GPUContext& ctx) { ...@@ -306,7 +309,7 @@ void TrtEngine::StaticRun(const phi::GPUContext& ctx) {
for (int i = 0; i < dims.nbDims; ++i) { for (int i = 0; i < dims.nbDims; ++i) {
ddim.push_back(dims.d[i]); ddim.push_back(dims.d[i]);
} }
bind.buffer->Resize(phi::make_ddim(ddim)); bind.buffer->Resize(::phi::make_ddim(ddim));
// TODO(wilber): now only support float output. // TODO(wilber): now only support float output.
ctx.Alloc<float>(bind.buffer, sizeof(float) * bind.buffer->numel()); ctx.Alloc<float>(bind.buffer, sizeof(float) * bind.buffer->numel());
buffers[bind_index] = static_cast<void*>(bind.buffer->data<float>()); buffers[bind_index] = static_cast<void*>(bind.buffer->data<float>());
...@@ -316,7 +319,7 @@ void TrtEngine::StaticRun(const phi::GPUContext& ctx) { ...@@ -316,7 +319,7 @@ void TrtEngine::StaticRun(const phi::GPUContext& ctx) {
runtime_batch, buffers.data(), ctx.stream(), nullptr); runtime_batch, buffers.data(), ctx.stream(), nullptr);
} }
void TrtEngine::DynamicRun(const phi::GPUContext& ctx) { void TrtEngine::DynamicRun(const ::phi::GPUContext& ctx) {
const int num_bindings = engine_->getNbBindings(); const int num_bindings = engine_->getNbBindings();
std::vector<void*> buffers(num_bindings, nullptr); std::vector<void*> buffers(num_bindings, nullptr);
...@@ -344,7 +347,7 @@ void TrtEngine::DynamicRun(const phi::GPUContext& ctx) { ...@@ -344,7 +347,7 @@ void TrtEngine::DynamicRun(const phi::GPUContext& ctx) {
for (int i = 0; i < dims.nbDims; ++i) { for (int i = 0; i < dims.nbDims; ++i) {
ddim[i] = dims.d[i]; ddim[i] = dims.d[i];
} }
bind.buffer->Resize(phi::make_ddim(ddim)); bind.buffer->Resize(::phi::make_ddim(ddim));
ctx.Alloc<float>(bind.buffer, sizeof(float) * bind.buffer->numel()); ctx.Alloc<float>(bind.buffer, sizeof(float) * bind.buffer->numel());
buffers[bind_index] = static_cast<void*>(bind.buffer->data<float>()); buffers[bind_index] = static_cast<void*>(bind.buffer->data<float>());
} }
...@@ -356,7 +359,7 @@ void TrtEngine::FreshDeviceId() { ...@@ -356,7 +359,7 @@ void TrtEngine::FreshDeviceId() {
int count; int count;
cudaGetDeviceCount(&count); cudaGetDeviceCount(&count);
CHECK_LT(device_id_, count); CHECK_LT(device_id_, count);
phi::backends::gpu::SetDeviceId(device_id_); ::phi::backends::gpu::SetDeviceId(device_id_);
} }
void TrtEngine::GetEngineInfo() { void TrtEngine::GetEngineInfo() {
......
...@@ -76,19 +76,19 @@ class TrtEngine { ...@@ -76,19 +76,19 @@ class TrtEngine {
const BuildOptions& build_options); const BuildOptions& build_options);
// TODO(wilber): Modify signature after infrt-trt ready. // TODO(wilber): Modify signature after infrt-trt ready.
void Run(const phi::GPUContext& ctx); void Run(const ::phi::GPUContext& ctx);
// TODO(wilber): How to support multiple execution contexts? // TODO(wilber): How to support multiple execution contexts?
bool SetUpInference( bool SetUpInference(
const InferenceOptions& inference, const InferenceOptions& inference,
const std::unordered_map<std::string, phi::DenseTensor*>& inputs); const std::unordered_map<std::string, ::phi::DenseTensor*>& inputs);
void GetEngineInfo(); void GetEngineInfo();
void PrepareOutputHandle(const std::string& out_name); void PrepareOutputHandle(const std::string& out_name);
// TODO(wilber): The output tensor names are: output_0, output_1, ... // TODO(wilber): The output tensor names are: output_0, output_1, ...
phi::DenseTensor* GetOutput(const std::string&); ::phi::DenseTensor* GetOutput(const std::string&);
size_t GetOutputNum() const; size_t GetOutputNum() const;
...@@ -104,9 +104,9 @@ class TrtEngine { ...@@ -104,9 +104,9 @@ class TrtEngine {
bool ModelToBuildEnv(TrtUniquePtr<nvinfer1::INetworkDefinition> network, bool ModelToBuildEnv(TrtUniquePtr<nvinfer1::INetworkDefinition> network,
const BuildOptions& build); const BuildOptions& build);
void StaticRun(const phi::GPUContext& ctx); void StaticRun(const ::phi::GPUContext& ctx);
void DynamicRun(const phi::GPUContext& ctx); void DynamicRun(const ::phi::GPUContext& ctx);
private: private:
std::unique_ptr<TrtLogger> logger_{nullptr}; std::unique_ptr<TrtLogger> logger_{nullptr};
...@@ -118,7 +118,7 @@ class TrtEngine { ...@@ -118,7 +118,7 @@ class TrtEngine {
std::vector<std::unique_ptr<Bindings>> bindings_; std::vector<std::unique_ptr<Bindings>> bindings_;
int device_id_{0}; int device_id_{0};
bool is_dynamic_shape_{false}; bool is_dynamic_shape_{false};
std::unordered_map<std::string, phi::DenseTensor> outputs_; std::unordered_map<std::string, ::phi::DenseTensor> outputs_;
}; };
} // namespace tensorrt } // namespace tensorrt
......
...@@ -92,7 +92,7 @@ class TrtLogger : public nvinfer1::ILogger { ...@@ -92,7 +92,7 @@ class TrtLogger : public nvinfer1::ILogger {
struct Binding { struct Binding {
bool is_input{false}; bool is_input{false};
nvinfer1::DataType data_type{nvinfer1::DataType::kFLOAT}; nvinfer1::DataType data_type{nvinfer1::DataType::kFLOAT};
phi::DenseTensor* buffer{nullptr}; ::phi::DenseTensor* buffer{nullptr};
std::string name; std::string name;
}; };
...@@ -103,7 +103,7 @@ class Bindings { ...@@ -103,7 +103,7 @@ class Bindings {
void AddBinding(int32_t b, void AddBinding(int32_t b,
const std::string& name, const std::string& name,
bool is_input, bool is_input,
phi::DenseTensor* buffer, ::phi::DenseTensor* buffer,
nvinfer1::DataType data_type) { nvinfer1::DataType data_type) {
while (bindings_.size() <= static_cast<size_t>(b)) { while (bindings_.size() <= static_cast<size_t>(b)) {
bindings_.emplace_back(); bindings_.emplace_back();
......
...@@ -97,4 +97,17 @@ def FakeKernelOp : PDT_Op<"fake_phi_kernel"> { ...@@ -97,4 +97,17 @@ def FakeKernelOp : PDT_Op<"fake_phi_kernel"> {
let results = (outs DenseTensor:$output); let results = (outs DenseTensor:$output);
} }
// TODO(wilber): Add an infrt_gpu dialect.
def PDT_GpuMemCopyOp : PDT_Op<"memcpy.gpu", [NoSideEffect]> {
let summary = "phi_dt.gpu.memcpy";
let description = [{gpu memcpy d2h or h2d}];
// TODO(wilber): add context argument to support stream.
let arguments = (ins
DenseTensor:$input,
Context:$context,
BoolAttr:$d2h
);
let results = (outs DenseTensor:$output);
}
#endif #endif
...@@ -97,12 +97,13 @@ void PhiOpConvertPass::convertStage() { ...@@ -97,12 +97,13 @@ void PhiOpConvertPass::convertStage() {
} }
auto loc = getFunction().getLoc(); auto loc = getFunction().getLoc();
builder.setInsertionPoint(op); builder.setInsertionPoint(op);
if (phi::KernelFactory::Instance().HasCompatiblePhiKernel(op_name)) {
std::string kernel_name = phi::TransToPhiKernelName(op_name); if (!::phi::OpUtilsMap::Instance().HasArgumentMappingFn(op_name)) {
op_name = phi::TransToPhiKernelName(op_name);
auto kernel_op = builder.create<infrt::KernelOp>(loc, auto kernel_op = builder.create<infrt::KernelOp>(loc,
op->getResultTypes(), op->getResultTypes(),
op->getOperands(), op->getOperands(),
kernel_name, op_name,
op->getAttrDictionary()); op->getAttrDictionary());
op->replaceAllUsesWith(kernel_op.getResults()); op->replaceAllUsesWith(kernel_op.getResults());
} else { } else {
......
...@@ -32,17 +32,24 @@ bool ProtoArgumentMappingContext::HasOutput(const std::string& name) const { ...@@ -32,17 +32,24 @@ bool ProtoArgumentMappingContext::HasOutput(const std::string& name) const {
} }
bool ProtoArgumentMappingContext::HasAttr(const std::string& name) const { bool ProtoArgumentMappingContext::HasAttr(const std::string& name) const {
if (name == "is_test") return true;
return op_->hasAttr(name); return op_->hasAttr(name);
} }
paddle::any ProtoArgumentMappingContext::Attr(const std::string& name) const { paddle::any ProtoArgumentMappingContext::Attr(const std::string& name) const {
mlir::Attribute attrs = op_->getAttr(name); if (name == "is_test") {
if (mlir::StringAttr str_attr = attrs.dyn_cast_or_null<mlir::StringAttr>()) { return paddle::any(true);
}
mlir::Attribute attr = op_->getAttr(name);
if (!attr) {
return paddle::any();
}
if (mlir::StringAttr str_attr = attr.dyn_cast<mlir::StringAttr>()) {
return paddle::any(str_attr.str()); return paddle::any(str_attr.str());
} else {
// ToDO: implementation in the ext PR.
return paddle::any(0);
} }
// TODO: implementation in the next PR.
return paddle::any(0);
} }
size_t ProtoArgumentMappingContext::InputSize(const std::string& name) const { size_t ProtoArgumentMappingContext::InputSize(const std::string& name) const {
......
...@@ -6,6 +6,7 @@ gather_srcs(infrt_src SRCS ...@@ -6,6 +6,7 @@ gather_srcs(infrt_src SRCS
trt_op_teller_pass.cc trt_op_teller_pass.cc
trt_graph_fuse_pass.cc trt_graph_fuse_pass.cc
trt_graph_split_pass.cc trt_graph_split_pass.cc
trt_type_convert_pass.cc
) )
mlir_tablegen_on(trt_ops) mlir_tablegen_on(trt_ops)
mlir_add_rewriter(pd_lower_to_trt) mlir_add_rewriter(pd_lower_to_trt)
......
...@@ -21,6 +21,26 @@ ...@@ -21,6 +21,26 @@
#include "paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h" #include "paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h"
#include "paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h" #include "paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h"
#include "paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h" #include "paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h"
#include "paddle/infrt/dialect/tensorrt/trt_type_convert_pass.h"
#include "paddle/infrt/host_context/core_runtime.h"
#include "paddle/infrt/host_context/kernel_registry.h"
#include "paddle/infrt/host_context/mlir_to_runtime_translate.h"
#include "paddle/infrt/kernel/basic_kernels.h"
#include "paddle/infrt/kernel/control_flow_kernels.h"
#include "paddle/infrt/kernel/tensor_kernels.h"
#include "paddle/infrt/kernel/tensor_shape_kernels.h"
#include "paddle/infrt/kernel/test_kernels.h"
#include "paddle/infrt/kernel/tensorrt/registry.h"
#ifdef INFRT_WITH_PHI
#include "paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.h"
#include "paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h"
#include "paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launchers.h"
#include "paddle/infrt/kernel/phi/registry.h"
#endif
int main(int argc, char** argv) { int main(int argc, char** argv) {
static llvm::cl::opt<std::string> input_file( static llvm::cl::opt<std::string> input_file(
...@@ -33,6 +53,22 @@ int main(int argc, char** argv) { ...@@ -33,6 +53,22 @@ int main(int argc, char** argv) {
mlir::MLIRContext* context = infrt::Global::getMLIRContext(); mlir::MLIRContext* context = infrt::Global::getMLIRContext();
auto module = infrt::dialect::LoadMlirFile(input_file.c_str(), context); auto module = infrt::dialect::LoadMlirFile(input_file.c_str(), context);
infrt::host_context::KernelRegistry registry;
::infrt::kernel::RegisterBasicKernels(&registry);
::infrt::kernel::RegisterTestKernels(&registry);
::infrt::kernel::RegisterTensorShapeKernels(&registry);
::infrt::kernel::RegisterTensorKernels(&registry);
::infrt::kernel::RegisterControlFlowKernels(&registry);
#ifdef INFRT_WITH_PHI
::infrt::kernel::RegisterPhiKernels(&registry);
::infrt::kernel::RegisterInferShapeLaunchers(&registry);
#endif
#if defined(INFRT_WITH_GPU) && defined(INFRT_WITH_TRT)
::infrt::kernel::RegisterTrtKernels(&registry);
#endif
context->loadAllAvailableDialects();
module->dump(); module->dump();
mlir::PassManager pm(context); mlir::PassManager pm(context);
...@@ -41,10 +77,12 @@ int main(int argc, char** argv) { ...@@ -41,10 +77,12 @@ int main(int argc, char** argv) {
trt_pass_manager.addPass(std::make_unique<infrt::trt::TRTGraphFusePass>()); trt_pass_manager.addPass(std::make_unique<infrt::trt::TRTGraphFusePass>());
trt_pass_manager.addPass(std::make_unique<infrt::trt::TRTGraphSplitPass>(1)); trt_pass_manager.addPass(std::make_unique<infrt::trt::TRTGraphSplitPass>(1));
trt_pass_manager.addPass(std::make_unique<infrt::trt::TRTOpConverterPass>()); trt_pass_manager.addPass(std::make_unique<infrt::trt::TRTOpConverterPass>());
trt_pass_manager.addPass(infrt::trt::createTrtTypeConvertPass());
if (mlir::failed(pm.run(*module))) { if (mlir::failed(pm.run(*module))) {
std::cout << "\npass failed!\n" << std::endl; std::cout << "\npass failed!\n" << std::endl;
return 4; return 4;
} }
module->dump(); module->dump();
::infrt::host_context::TestMlir(module.get(), &registry);
return 0; return 0;
} }
...@@ -12,10 +12,17 @@ ...@@ -12,10 +12,17 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h" #include "paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h"
#include <glog/logging.h>
#include <mlir/IR/Builders.h> #include <mlir/IR/Builders.h>
#include <mlir/Transforms/DialectConversion.h> #include <mlir/Transforms/DialectConversion.h>
#include "paddle/infrt/dialect/dense_tensor.h"
#include "paddle/infrt/dialect/pd/ir/pd_ops.h" #include "paddle/infrt/dialect/pd/ir/pd_ops.h"
#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h"
#include "paddle/infrt/dialect/phi/ir/phi_base.h"
#include "paddle/infrt/dialect/tensorrt/trt_dialect_types.h" #include "paddle/infrt/dialect/tensorrt/trt_dialect_types.h"
#include "paddle/infrt/dialect/tensorrt/trt_ops.h"
namespace infrt { namespace infrt {
namespace trt { namespace trt {
...@@ -41,34 +48,34 @@ struct PD2TRT_GraphLower : public ::mlir::RewritePattern { ...@@ -41,34 +48,34 @@ struct PD2TRT_GraphLower : public ::mlir::RewritePattern {
::llvm::SmallVector<mlir::Type, 4>(1, EngineType::get()), ::llvm::SmallVector<mlir::Type, 4>(1, EngineType::get()),
trt_inputs, trt_inputs,
true /*run_once*/); true /*run_once*/);
::mlir::Block *block = new ::mlir::Block; auto &block = create_engine_op.body().emplaceBlock();
block->getOperations().splice(block->begin(), block.getOperations().splice(block.begin(),
casted_op.getBody()->getOperations(), casted_op.getBody()->getOperations(),
casted_op.getBody()->begin(), casted_op.getBody()->begin(),
casted_op.getBody()->end()); casted_op.getBody()->end());
create_engine_op.body().push_back(block);
// trt.execute // trt.compute
// outputs ::llvm::SmallVector<::mlir::Value, 4> replace_values2;
::llvm::SmallVector<::mlir::Type, 4> execute_outputs_types; auto ctx_op = rewriter.create<::infrt::phi::CreateGPUContextOp>(
for (auto v : casted_op.getODSResults(0)) { ods_loc,
execute_outputs_types.push_back(v.getType()); infrt::phi::ContextType::get(rewriter.getContext(),
} infrt::TargetType::GPU));
// inputs auto compute_op = rewriter.create<EngineComputeOp>(
::mlir::SmallVector<::mlir::Value, 4> execute_inputs( ods_loc,
create_engine_op.getODSResults(0)); ::infrt::DenseTensorListType::get(rewriter.getContext()),
for (auto v : inputs) { create_engine_op.engine(),
execute_inputs.push_back(v); ctx_op.output());
} auto tensor_list_val = compute_op.outputs();
auto execute_op = rewriter.create<ExecuteOp>( for (size_t i = 0; i < casted_op.getNumResults(); ++i) {
ods_loc, execute_outputs_types, execute_inputs); auto res = casted_op->getResult(i);
auto int_attr = mlir::IntegerAttr::get(
::llvm::SmallVector<::mlir::Value, 4> replace_values; mlir::IntegerType::get(rewriter.getContext(), 32), i);
for (auto v : auto get_tensor_op = rewriter.create<::infrt::dt::TensorListGetTensorOp>(
::llvm::SmallVector<::mlir::Value, 4>{execute_op.getODSResults(0)}) { ods_loc, res.getType(), tensor_list_val, int_attr);
replace_values.push_back(v); replace_values2.push_back(get_tensor_op.output());
} }
rewriter.replaceOp(op, replace_values); ctx_op->moveBefore(ctx_op->getBlock(), ctx_op->getBlock()->begin());
rewriter.replaceOp(op, replace_values2);
return ::mlir::success(); return ::mlir::success();
} }
}; };
...@@ -82,6 +89,9 @@ void TRTOpConverterPass::runOnOperation() { ...@@ -82,6 +89,9 @@ void TRTOpConverterPass::runOnOperation() {
// this lowering. In our case, we are lowering to TensorRTDialect from // this lowering. In our case, we are lowering to TensorRTDialect from
// PaddleDialect // PaddleDialect
target.addLegalDialect<TensorRTDialect>(); target.addLegalDialect<TensorRTDialect>();
target.addLegalDialect<::infrt::phi::PHIDialect>();
target.addLegalDialect<::infrt::dt::DTDialect>();
target.addLegalDialect<phi::PHIDenseTensorDialect>();
// Now that the conversion target has been defined, we just need to provide // Now that the conversion target has been defined, we just need to provide
// the set of patterns that will lower the TensorRT operations. // the set of patterns that will lower the TensorRT operations.
......
...@@ -14,7 +14,9 @@ ...@@ -14,7 +14,9 @@
#include "paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h" #include "paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h"
#include <llvm/Support/Casting.h>
#include <mlir/IR/Builders.h> #include <mlir/IR/Builders.h>
#include "paddle/infrt/dialect/dense_tensor.h"
#include "paddle/infrt/dialect/infrt/ir/basic_kernels.h" #include "paddle/infrt/dialect/infrt/ir/basic_kernels.h"
#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" #include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h"
#include "paddle/infrt/dialect/pd/ir/pd_ops.h" #include "paddle/infrt/dialect/pd/ir/pd_ops.h"
...@@ -35,10 +37,12 @@ void TRTOpTellerPass::runOnFunction() { ...@@ -35,10 +37,12 @@ void TRTOpTellerPass::runOnFunction() {
auto *op = worklist.back(); auto *op = worklist.back();
worklist.pop_back(); worklist.pop_back();
if (op == nullptr) continue; if (op == nullptr) continue;
if (op->getName().getStringRef().substr(0, 3) != "pd.") continue;
if (::llvm::dyn_cast_or_null<infrt::pd::FeedOp>(op)) continue; if (::llvm::dyn_cast_or_null<infrt::pd::FeedOp>(op)) continue;
if (::llvm::dyn_cast_or_null<infrt::pd::FetchOp>(op)) continue; if (::llvm::dyn_cast_or_null<infrt::pd::FetchOp>(op)) continue;
if (::llvm::dyn_cast_or_null<infrt::pd::GraphOp>(op)) continue; if (::llvm::dyn_cast_or_null<infrt::pd::GraphOp>(op)) continue;
if (::llvm::dyn_cast_or_null<::infrt::ReturnOp>(op)) continue; if (::llvm::dyn_cast_or_null<::infrt::ReturnOp>(op)) continue;
builder.setInsertionPoint(op); builder.setInsertionPoint(op);
auto loc = getFunction().getLoc(); auto loc = getFunction().getLoc();
auto graph_op = builder.create<infrt::pd::GraphOp>( auto graph_op = builder.create<infrt::pd::GraphOp>(
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/infrt/dialect/tensorrt/trt_type_convert_pass.h"
#include <glog/logging.h>
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Casting.h"
#include "mlir/IR/Block.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/Dialect.h"
#include "mlir/IR/Operation.h"
#include "mlir/IR/OperationSupport.h"
#include "mlir/IR/Value.h"
#include "mlir/Pass/Pass.h"
#include "paddle/infrt/dialect/infrt/common/types.h"
#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h"
#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h"
#include "paddle/infrt/dialect/tensorrt/trt_ops.h"
namespace {
class TrtTypeConvertPass
: public mlir::PassWrapper<TrtTypeConvertPass, mlir::FunctionPass> {
public:
::llvm::StringRef getName() const override { return "TrtTypeConvertPass"; }
void runOnFunction() override;
};
void TrtTypeConvertPass::runOnFunction() {
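// This pass re-annotates tensor types and inserts explicit host/device copies:
// (1) results of phi_dt.tensor_map_get_tensor are given NCHW layout;
// (2) host-side operands of a CreateEngineOp get a GpuMemCopyOp (h2d) inserted
//     after their producer (or after the GPU context op for block arguments),
//     and result types inside the engine block are switched to the GPU target;
// (3) return operands that are not already plain CPU tensors are copied back
//     to the host with a GpuMemCopyOp (d2h).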
mlir::Block& body = getFunction().front();
auto* mlir_ctx = getFunction()->getContext();
mlir::OpBuilder builder(&body, body.begin());
std::vector<mlir::Operation*> worklist;
mlir::Operation* ctx_op{nullptr};
worklist.reserve(body.getOperations().size());
for (auto& op : body) {
worklist.push_back(&op);
if (op.getName().getStringRef() == "phi_dt.create_context.gpu") {
ctx_op = &op;
}
}
::infrt::LayoutType layout = ::infrt::LayoutType::NCHW;
::infrt::TargetType target = ::infrt::TargetType::GPU;
for (auto& op : worklist) {
if (auto tensor_map_get_op =
llvm::dyn_cast<::infrt::phi::TensorMapGetTensorOp>(op)) {
auto res = tensor_map_get_op.output();
if (auto t = res.getType().dyn_cast<::infrt::DenseTensorType>()) {
auto replace_type = ::infrt::DenseTensorType::get(
mlir_ctx, t.getTarget(), t.getPrecision(), layout);
res.setType(replace_type);
}
}
if (auto create_engine = llvm::dyn_cast<::infrt::trt::CreateEngineOp>(op)) {
// Insert a GPU memcpy (h2d) op for operands that still live on the host.
for (auto arg : create_engine.getOperands()) {
if (mlir::Operation* producer = arg.getDefiningOp()) {
if (arg.getType().isa<::infrt::DenseTensorType>()) {
builder.setInsertionPointAfter(producer);
auto t = arg.getType().dyn_cast<::infrt::DenseTensorType>();
if (producer->getName().getStringRef() !=
"phi_dt.tensor_map_get_tensor" &&
t.getTarget() != ::infrt::TargetType::GPU) {
auto replace_type = ::infrt::DenseTensorType::get(
mlir_ctx, target, t.getPrecision(), layout);
CHECK_NOTNULL(ctx_op);
auto mem_cpy_op = builder.create<::infrt::phi::GpuMemCopyOp>(
arg.getLoc(),
replace_type,
arg,
llvm::dyn_cast<::infrt::phi::CreateGPUContextOp>(ctx_op)
.output(),
mlir::BoolAttr::get(mlir_ctx, /*d2h*/ false));
arg.replaceAllUsesExcept(mem_cpy_op.output(), mem_cpy_op);
}
}
} else {
auto blockArg = arg.cast<mlir::BlockArgument>();
if (arg.getType().isa<::infrt::DenseTensorType>()) {
auto t = arg.getType().dyn_cast<::infrt::DenseTensorType>();
builder.setInsertionPointAfter(ctx_op);
auto replace_type = ::infrt::DenseTensorType::get(
mlir_ctx, ::infrt::TargetType::GPU, t.getPrecision(), layout);
CHECK_NOTNULL(ctx_op);
auto mem_cpy_op = builder.create<::infrt::phi::GpuMemCopyOp>(
blockArg.getLoc(),
replace_type,
blockArg,
llvm::dyn_cast<::infrt::phi::CreateGPUContextOp>(ctx_op)
.output(),
mlir::BoolAttr::get(mlir_ctx, /*d2h*/ false));
arg.replaceAllUsesExcept(mem_cpy_op.output(), mem_cpy_op);
}
}
}
// Change the result types of ops inside the engine block to the GPU target.
auto& block = create_engine.getRegion().getBlocks().front();
for (auto& op : block.without_terminator()) {
for (size_t i = 0; i < op.getNumResults(); ++i) {
if (auto t = op.getResult(i)
.getType()
.dyn_cast<::infrt::DenseTensorType>()) {
auto replace_type = ::infrt::DenseTensorType::get(
mlir_ctx, ::infrt::TargetType::GPU, t.getPrecision(), layout);
op.getResult(i).setType(replace_type);
}
}
}
} else if (auto list_get_tensor_op =
llvm::dyn_cast<::infrt::dt::TensorListGetTensorOp>(op)) {
auto result = list_get_tensor_op.output();
if (auto t = result.getType().dyn_cast<::infrt::DenseTensorType>()) {
result.setType(::infrt::DenseTensorType::get(
mlir_ctx, ::infrt::TargetType::GPU, t.getPrecision(), layout));
}
} else if (auto return_op = llvm::dyn_cast<::infrt::ReturnOp>(op)) {
for (auto arg : return_op->getOperands()) {
if (auto t = arg.getType().dyn_cast<::infrt::DenseTensorType>()) {
if (t.getLayout() != ::infrt::LayoutType::ANY ||
t.getTarget() != ::infrt::TargetType::CPU ||
t.getPrecision() != ::infrt::PrecisionType::FLOAT32) {
builder.setInsertionPoint(return_op);
CHECK_NOTNULL(ctx_op);
auto mem_cpy_op = builder.create<::infrt::phi::GpuMemCopyOp>(
return_op.getLoc(),
::infrt::DenseTensorType::get(mlir_ctx,
::infrt::TargetType::CPU,
t.getPrecision(),
::infrt::LayoutType::ANY),
arg,
llvm::dyn_cast<::infrt::phi::CreateGPUContextOp>(ctx_op)
.output(),
mlir::BoolAttr::get(mlir_ctx, /*d2h*/ true));
arg.replaceAllUsesExcept(mem_cpy_op.output(), mem_cpy_op);
}
}
}
}
}
}
} // namespace
namespace infrt {
namespace trt {
std::unique_ptr<mlir::Pass> createTrtTypeConvertPass() {
return std::make_unique<TrtTypeConvertPass>();
}
} // namespace trt
} // namespace infrt
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <mlir/Pass/Pass.h>
namespace infrt {
namespace trt {
std::unique_ptr<mlir::Pass> createTrtTypeConvertPass();
} // namespace trt
} // namespace infrt
...@@ -130,7 +130,7 @@ boost::optional<int32_t> MlirToRuntimeTranslator::EmitAttribute( ...@@ -130,7 +130,7 @@ boost::optional<int32_t> MlirToRuntimeTranslator::EmitAttribute(
if (attr.isa<mlir::IntegerAttr>()) { if (attr.isa<mlir::IntegerAttr>()) {
auto val = attr.cast<mlir::IntegerAttr>(); auto val = attr.cast<mlir::IntegerAttr>();
if (val.getType().isInteger(32)) { if (val.getType().isInteger(32)) {
return val.getInt(); return val.getValue().getSExtValue();
} }
} }
return boost::none; return boost::none;
...@@ -142,7 +142,7 @@ boost::optional<int64_t> MlirToRuntimeTranslator::EmitAttribute( ...@@ -142,7 +142,7 @@ boost::optional<int64_t> MlirToRuntimeTranslator::EmitAttribute(
if (attr.isa<mlir::IntegerAttr>()) { if (attr.isa<mlir::IntegerAttr>()) {
auto val = attr.cast<mlir::IntegerAttr>(); auto val = attr.cast<mlir::IntegerAttr>();
if (val.getType().isInteger(64)) { if (val.getType().isInteger(64)) {
return val.getInt(); return val.getValue().getSExtValue();
} }
} }
return boost::none; return boost::none;
...@@ -233,7 +233,7 @@ boost::optional<std::string> MlirToRuntimeTranslator::EmitAttribute( ...@@ -233,7 +233,7 @@ boost::optional<std::string> MlirToRuntimeTranslator::EmitAttribute(
\ \
std::vector<type__> res; \ std::vector<type__> res; \
for (auto& v : array) { \ for (auto& v : array) { \
res.push_back(v.cast<mlir::IntegerAttr>().getInt()); \ res.push_back(v.cast<mlir::IntegerAttr>().getValue().getSExtValue()); \
} \ } \
return res; \ return res; \
} }
...@@ -309,7 +309,7 @@ bool MlirToRuntimeTranslator::EmitGeneralOp( ...@@ -309,7 +309,7 @@ bool MlirToRuntimeTranslator::EmitGeneralOp(
arg_value = GetOpResult(upstream_op); arg_value = GetOpResult(upstream_op);
} }
} }
if (arg_value->is_type<phi::DenseTensor>()) { if (arg_value->is_type<::phi::DenseTensor>()) {
impl_->runtime->FeedInArgs( impl_->runtime->FeedInArgs(
std::make_pair(std::to_string(i), ValueRef(arg_value))); std::make_pair(std::to_string(i), ValueRef(arg_value)));
} }
......
...@@ -147,6 +147,7 @@ class Value : public common::Object { ...@@ -147,6 +147,7 @@ class Value : public common::Object {
#endif #endif
explicit Value(::phi::DenseTensor&& x) : data(std::move(x)) {} explicit Value(::phi::DenseTensor&& x) : data(std::move(x)) {}
explicit Value(::phi::MetaTensor&& x) : data(std::move(x)) {} explicit Value(::phi::MetaTensor&& x) : data(std::move(x)) {}
explicit Value(::phi::MetaConfig&& x) : data(std::move(x)) {}
#ifdef INFRT_WITH_TRT #ifdef INFRT_WITH_TRT
explicit Value(::infrt::backends::tensorrt::TrtEngine&& x) explicit Value(::infrt::backends::tensorrt::TrtEngine&& x)
: data(std::move(x)) {} : data(std::move(x)) {}
......
...@@ -30,6 +30,7 @@ namespace phi { ...@@ -30,6 +30,7 @@ namespace phi {
::phi::GPUContext context; ::phi::GPUContext context;
context.PartialInitWithoutAllocator(); context.PartialInitWithoutAllocator();
context.SetAllocator(new ::infrt::backends::GpuPhiAllocator{}); context.SetAllocator(new ::infrt::backends::GpuPhiAllocator{});
context.SetHostAllocator(new backends::CpuPhiAllocator{});
context.PartialInitWithAllocator(); context.PartialInitWithAllocator();
return context; return context;
} }
......
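Setting the host allocator here matters for the device-to-host path added below: GpuMemCpy in dense_tensor_kernels.cc builds the destination tensor from context.GetHostAllocator(), so a GPU context created without a host allocator could not serve d2h copies.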
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
// limitations under the License. // limitations under the License.
#include "paddle/infrt/kernel/phi/dense_tensor_kernels.h" #include "paddle/infrt/kernel/phi/dense_tensor_kernels.h"
#include "llvm/Support/ErrorHandling.h"
#include "paddle/infrt/common/string.h" #include "paddle/infrt/common/string.h"
#include "paddle/infrt/dialect/phi/data_type.h" #include "paddle/infrt/dialect/phi/data_type.h"
#include "paddle/infrt/kernel/phi/context_kernels.h" #include "paddle/infrt/kernel/phi/context_kernels.h"
...@@ -228,6 +229,69 @@ int32_t TensorMapGetSize(const ::infrt::phi::DenseTensorMap& map) { ...@@ -228,6 +229,69 @@ int32_t TensorMapGetSize(const ::infrt::phi::DenseTensorMap& map) {
return map.size(); return map.size();
} }
#ifdef INFRT_WITH_GPU
inline size_t SizeOfDataType(::phi::DataType data_type) {
switch (data_type) {
case ::phi::DataType::BOOL:
case ::phi::DataType::UINT8:
case ::phi::DataType::INT8:
return 1;
case ::phi::DataType::BFLOAT16:
case ::phi::DataType::FLOAT16:
case ::phi::DataType::INT16:
case ::phi::DataType::UINT16:
return 2;
case ::phi::DataType::FLOAT32:
case ::phi::DataType::INT32:
case ::phi::DataType::UINT32:
return 4;
case ::phi::DataType::FLOAT64:
case ::phi::DataType::INT64:
case ::phi::DataType::UINT64:
case ::phi::DataType::COMPLEX64:
return 8;
case ::phi::DataType::COMPLEX128:
return 16;
case ::phi::DataType::UNDEFINED:
return 0;
default:
llvm_unreachable("should not reach here");
return 0;
}
return 0;
}
::phi::DenseTensor GpuMemCpy(const ::phi::DenseTensor& input,
const ::phi::GPUContext& context,
bool d2h) {
if (d2h) {
::phi::DenseTensor ret(
const_cast<::phi::Allocator*>(&context.GetHostAllocator()),
input.meta());
CHECK(input.place().GetType() == ::phi::AllocationType::GPU);
// TODO(wilber): Add sync op and stream.
cudaMemcpyAsync(ret.data(),
input.data(),
SizeOfDataType(input.dtype()) * input.numel(),
cudaMemcpyDeviceToHost,
nullptr);
return ret;
} else {
// h2d
::phi::DenseTensor ret(
const_cast<::phi::Allocator*>(&context.GetAllocator()), input.meta());
CHECK(input.place().GetType() == ::phi::AllocationType::CPU ||
input.place().GetType() == ::phi::AllocationType::GPUPINNED);
// TODO(wilber): Add sync op and stream.
cudaMemcpyAsync(ret.data(),
input.data(),
SizeOfDataType(input.dtype()) * input.numel(),
cudaMemcpyHostToDevice,
nullptr);
return ret;
}
}
#endif
} // namespace phi } // namespace phi
} // namespace kernel } // namespace kernel
} // namespace infrt } // namespace infrt
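A minimal usage sketch of the new copy helper (assumes a ::phi::GPUContext `gpu_ctx` created as above and a CPU-resident ::phi::DenseTensor `host_in`; the copies are enqueued with cudaMemcpyAsync on the default stream, so a conservative synchronization is shown before reading the result):
::phi::DenseTensor device_t =
    ::infrt::kernel::phi::GpuMemCpy(host_in, gpu_ctx, /*d2h=*/false);  // H2D
::phi::DenseTensor host_out =
    ::infrt::kernel::phi::GpuMemCpy(device_t, gpu_ctx, /*d2h=*/true);  // D2H
cudaDeviceSynchronize();  // conservative; the TODO above notes proper stream sync is still missing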
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
#include "paddle/infrt/dialect/infrt/common/types.h" #include "paddle/infrt/dialect/infrt/common/types.h"
#include "paddle/infrt/host_context/kernel_utils.h" #include "paddle/infrt/host_context/kernel_utils.h"
#include "paddle/infrt/tensor/phi/tensor_map.h" #include "paddle/infrt/tensor/phi/tensor_map.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/dense_tensor.h"
namespace infrt { namespace infrt {
...@@ -55,6 +56,12 @@ infrt::phi::DenseTensorMap LoadParams( ...@@ -55,6 +56,12 @@ infrt::phi::DenseTensorMap LoadParams(
int32_t TensorMapGetSize(const ::infrt::phi::DenseTensorMap& map); int32_t TensorMapGetSize(const ::infrt::phi::DenseTensorMap& map);
#ifdef INFRT_WITH_GPU
::phi::DenseTensor GpuMemCpy(const ::phi::DenseTensor& input,
const ::phi::GPUContext& context,
bool d2h);
#endif
} // namespace phi } // namespace phi
} // namespace kernel } // namespace kernel
} // namespace infrt } // namespace infrt
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#include "paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launcher.h" #include "paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launcher.h"
#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/meta_tensor.h"
namespace infrt { namespace infrt {
namespace kernel { namespace kernel {
...@@ -31,6 +32,10 @@ void InferShapedKernelLauncher::CreateKernelFrameForInferShape( ...@@ -31,6 +32,10 @@ void InferShapedKernelLauncher::CreateKernelFrameForInferShape(
infershape_kernel_frame_builder.AddArgument(value); infershape_kernel_frame_builder.AddArgument(value);
} }
} }
if (infershape_kernel_frame_builder.GetNumArgs() < arg_size_) {
infershape_kernel_frame_builder.AddArgument(
new host_context::Value(::phi::MetaConfig()));
}
} }
void InferShapedKernelLauncher::BuildInferShapeCache( void InferShapedKernelLauncher::BuildInferShapeCache(
......
...@@ -22,11 +22,8 @@ namespace infrt { ...@@ -22,11 +22,8 @@ namespace infrt {
namespace kernel { namespace kernel {
struct InferShapedKernelLauncher { struct InferShapedKernelLauncher {
virtual void Invoke(host_context::KernelFrame* frame) = 0; explicit InferShapedKernelLauncher(int arg_size) : arg_size_(arg_size) {}
~InferShapedKernelLauncher() = default;
virtual ~InferShapedKernelLauncher() = default;
protected:
//! Initialize the kernel frame for InferShape kernel. //! Initialize the kernel frame for InferShape kernel.
// This method will create a new KernelFrame with all the Tensors(currently // This method will create a new KernelFrame with all the Tensors(currently
// only DenseHostTensor) converted into MetaTensors so that the infer-shape // only DenseHostTensor) converted into MetaTensors so that the infer-shape
...@@ -46,6 +43,7 @@ struct InferShapedKernelLauncher { ...@@ -46,6 +43,7 @@ struct InferShapedKernelLauncher {
llvm::SmallVector<host_context::ValueRef, 3> values; llvm::SmallVector<host_context::ValueRef, 3> values;
llvm::SmallVector<::phi::DDim, 3> tensor_shape_cache; llvm::SmallVector<::phi::DDim, 3> tensor_shape_cache;
host_context::KernelFrameBuilder infershape_kernel_frame_builder; host_context::KernelFrameBuilder infershape_kernel_frame_builder;
const int arg_size_;
}; };
} // namespace kernel } // namespace kernel
......
...@@ -24,46 +24,44 @@ ...@@ -24,46 +24,44 @@
namespace infrt { namespace infrt {
namespace kernel { namespace kernel {
template <typename F>
struct FuncArgStatics {};
template <typename Return, typename... Args>
struct FuncArgStatics<Return (*)(Args...)> {
constexpr static int arg_size = sizeof...(Args);
};
template <typename KernelFunc, template <typename KernelFunc,
KernelFunc kernel, KernelFunc kernel,
typename InferShapedFunc, typename InferShapedFunc,
InferShapedFunc infershape> InferShapedFunc infershape>
class KernelLauncher : public InferShapedKernelLauncher { void KernelLauncherFunc(host_context::KernelFrame* frame) {
public: static InferShapedKernelLauncher launcher(
FuncArgStatics<InferShapedFunc>::arg_size);
static const uint16_t num_input_tensors{InferShapeHelper<KernelFunc>::count}; static const uint16_t num_input_tensors{InferShapeHelper<KernelFunc>::count};
static const bool turn_on_infer_shape_cache{true}; static const bool turn_on_infer_shape_cache{true};
void Invoke(host_context::KernelFrame* frame) override {
#ifndef NDEBUG #ifndef NDEBUG
LOG(INFO) << "Kernel.frame: " << frame->DumpArgTypes(); LOG(INFO) << "Kernel.frame: " << frame->DumpArgTypes();
#endif #endif
// Build the infershape KernelFrame if needed. // Build the infershape KernelFrame if needed.
// TODO(Superjomn) add unlikely here. // TODO(Superjomn) add unlikely here.
if (infershape_kernel_frame_builder.IsEmpty()) { if (launcher.infershape_kernel_frame_builder.IsEmpty()) {
CreateKernelFrameForInferShape(frame); launcher.CreateKernelFrameForInferShape(frame);
#ifndef NDEBUG #ifndef NDEBUG
LOG(INFO) << "infershape.frame: " LOG(INFO) << "infershape.frame: "
<< infershape_kernel_frame_builder.DumpArgTypes(); << launcher.infershape_kernel_frame_builder.DumpArgTypes();
#endif #endif
}
if (turn_on_infer_shape_cache) {
if (launcher.IsShapeChanged(num_input_tensors)) {
::infrt::host_context::KernelImpl<InferShapedFunc, infershape>::Invoke(
&launcher.infershape_kernel_frame_builder);
launcher.BuildInferShapeCache(num_input_tensors);
} }
if (turn_on_infer_shape_cache) {
if (!turn_on_infer_shape_cache || IsShapeChanged(num_input_tensors)) {
::infrt::host_context::KernelImpl<InferShapedFunc, infershape>::Invoke(
&infershape_kernel_frame_builder);
BuildInferShapeCache(num_input_tensors);
}
}
::infrt::host_context::KernelImpl<KernelFunc, kernel>::Invoke(frame);
} }
}; ::infrt::host_context::KernelImpl<KernelFunc, kernel>::Invoke(frame);
template <typename KernelFunc,
KernelFunc kernel,
typename InferShapedFunc,
InferShapedFunc infershape>
void KernelLauncherFunc(
KernelLauncher<KernelFunc, kernel, InferShapedFunc, infershape> launcher,
host_context::KernelFrame* frame) {
launcher.Invoke(frame);
} }
} // namespace kernel } // namespace kernel
......
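The rewrite above turns the KernelLauncher class into a plain function that keeps a function-local static InferShapedKernelLauncher, sized at compile time by FuncArgStatics, which simply counts the parameters of the infershape function pointer. A self-contained compile-time sketch of that trait (DummyInferShape is hypothetical, for illustration only):
void DummyInferShape(int, float, double);
static_assert(
    infrt::kernel::FuncArgStatics<decltype(&DummyInferShape)>::arg_size == 3,
    "arg_size equals the number of parameters of the function pointer type");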
...@@ -52,6 +52,9 @@ void RegisterPhiKernels(host_context::KernelRegistry* registry) { ...@@ -52,6 +52,9 @@ void RegisterPhiKernels(host_context::KernelRegistry* registry) {
"phi_dt.create_dense_tensor.gpu", "phi_dt.create_dense_tensor.gpu",
INFRT_KERNEL(infrt::kernel::phi::CreateGPUDenseTensor), INFRT_KERNEL(infrt::kernel::phi::CreateGPUDenseTensor),
{"dims", "lod", "layout", "precision"}); {"dims", "lod", "layout", "precision"});
registry->AddKernelWithAttrs("phi_dt.memcpy.gpu",
INFRT_KERNEL(infrt::kernel::phi::GpuMemCpy),
{"d2h"});
#endif #endif
registry->AddKernelWithAttrs("phi_dt.load_params", registry->AddKernelWithAttrs("phi_dt.load_params",
INFRT_KERNEL(infrt::kernel::phi::LoadParams), INFRT_KERNEL(infrt::kernel::phi::LoadParams),
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#include "paddle/infrt/kernel/tensorrt/trt_kernels.h" #include "paddle/infrt/kernel/tensorrt/trt_kernels.h"
#include <string> #include <string>
#include <unordered_set>
#include "NvInfer.h" #include "NvInfer.h"
#include "NvInferRuntime.h" #include "NvInferRuntime.h"
#include "NvInferRuntimeCommon.h" #include "NvInferRuntimeCommon.h"
...@@ -68,7 +69,7 @@ namespace tensorrt { ...@@ -68,7 +69,7 @@ namespace tensorrt {
auto& region = operation.getRegion(0); auto& region = operation.getRegion(0);
auto& block = region.getBlocks().front(); auto& block = region.getBlocks().front();
std::unordered_map<std::string, phi::DenseTensor*> trt_bind_inputs; std::unordered_map<std::string, ::phi::DenseTensor*> trt_bind_inputs;
ValueToITensorMap value_to_trt_tensor_map; ValueToITensorMap value_to_trt_tensor_map;
ValueToTensorMap value_to_tensor_map; ValueToTensorMap value_to_tensor_map;
...@@ -79,7 +80,7 @@ namespace tensorrt { ...@@ -79,7 +80,7 @@ namespace tensorrt {
const std::string input_name = "input_" + std::to_string(idx); const std::string input_name = "input_" + std::to_string(idx);
auto* v = symbol_table->GetValue(std::to_string(idx)); auto* v = symbol_table->GetValue(std::to_string(idx));
CHECK_NOTNULL(v); CHECK_NOTNULL(v);
auto* t = &v->get<phi::DenseTensor>(); auto* t = &v->get<::phi::DenseTensor>();
value_to_tensor_map[operand] = t; value_to_tensor_map[operand] = t;
// TODO(wilber): get input info from mlir. // TODO(wilber): get input info from mlir.
...@@ -93,7 +94,7 @@ namespace tensorrt { ...@@ -93,7 +94,7 @@ namespace tensorrt {
if (operand.isa<mlir::BlockArgument>()) { if (operand.isa<mlir::BlockArgument>()) {
// TODO(wilber): A trick: the weights are CPU tensor and inputs are GPU // TODO(wilber): A trick: the weights are CPU tensor and inputs are GPU
// tensor, so we treat all GPU tensors as inputs to trt. // tensor, so we treat all GPU tensors as inputs to trt.
if (t->place().GetType() == phi::AllocationType::GPU) { if (t->place().GetType() == ::phi::AllocationType::GPU) {
trt_bind_inputs[input_name] = t; trt_bind_inputs[input_name] = t;
nvinfer1::Dims dims; nvinfer1::Dims dims;
dims.nbDims = t->dims().size() - 1; dims.nbDims = t->dims().size() - 1;
...@@ -106,8 +107,10 @@ namespace tensorrt { ...@@ -106,8 +107,10 @@ namespace tensorrt {
} }
} else { } else {
// TODO(wilber): Replace with the op name that generates the weights. // TODO(wilber): Replace with the op name that generates the weights.
if (operand.getDefiningOp()->getName().getStringRef() != std::unordered_set<std::string> weight_flags{
"phi_dt.create_dense_tensor.cpu") { "phi_dt.tensor_map_get_tensor", "phi_dt.create_dense_tensor.cpu"};
if (!weight_flags.count(
operand.getDefiningOp()->getName().getStringRef().str())) {
trt_bind_inputs[input_name] = t; trt_bind_inputs[input_name] = t;
nvinfer1::Dims dims; nvinfer1::Dims dims;
dims.nbDims = t->dims().size() - 1; dims.nbDims = t->dims().size() - 1;
...@@ -167,10 +170,10 @@ void PrintTrtLayer(backends::tensorrt::TrtEngine* engine) { ...@@ -167,10 +170,10 @@ void PrintTrtLayer(backends::tensorrt::TrtEngine* engine) {
engine->GetEngineInfo(); engine->GetEngineInfo();
} }
std::vector<phi::DenseTensor*> TrtEngineCompute( std::vector<::phi::DenseTensor*> TrtEngineCompute(
backends::tensorrt::TrtEngine* engine, const phi::GPUContext& context) { backends::tensorrt::TrtEngine* engine, const ::phi::GPUContext& context) {
engine->Run(context); engine->Run(context);
std::vector<phi::DenseTensor*> res; std::vector<::phi::DenseTensor*> res;
for (size_t i = 0; i < engine->GetOutputNum(); ++i) { for (size_t i = 0; i < engine->GetOutputNum(); ++i) {
res.push_back(engine->GetOutput("output_" + std::to_string(i))); res.push_back(engine->GetOutput("output_" + std::to_string(i)));
} }
......
...@@ -41,8 +41,8 @@ struct MlirOperationWithInfrtSymbol { ...@@ -41,8 +41,8 @@ struct MlirOperationWithInfrtSymbol {
void PrintTrtLayer(backends::tensorrt::TrtEngine* engine); void PrintTrtLayer(backends::tensorrt::TrtEngine* engine);
std::vector<phi::DenseTensor*> TrtEngineCompute( std::vector<::phi::DenseTensor*> TrtEngineCompute(
backends::tensorrt::TrtEngine* engine, const phi::GPUContext& context); backends::tensorrt::TrtEngine* engine, const ::phi::GPUContext& context);
} // namespace tensorrt } // namespace tensorrt
} // namespace kernel } // namespace kernel
......
...@@ -7,3 +7,4 @@ add_test(NAME test_infrt_by_lit COMMAND sh -c "lit -v ${CMAKE_SOURCE_DIR}/paddle ...@@ -7,3 +7,4 @@ add_test(NAME test_infrt_by_lit COMMAND sh -c "lit -v ${CMAKE_SOURCE_DIR}/paddle
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/dialect/tensor/tensor_map.mlir.in ${CMAKE_CURRENT_SOURCE_DIR}/dialect/tensor/tensor_map.mlir) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/dialect/tensor/tensor_map.mlir.in ${CMAKE_CURRENT_SOURCE_DIR}/dialect/tensor/tensor_map.mlir)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/dialect/phi/linear_cpu.mlir.in ${CMAKE_CURRENT_SOURCE_DIR}/dialect/phi/linear_cpu.mlir) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/dialect/phi/linear_cpu.mlir.in ${CMAKE_CURRENT_SOURCE_DIR}/dialect/phi/linear_cpu.mlir)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/dialect/tensorrt/disabled_linear.mlir.in ${CMAKE_CURRENT_SOURCE_DIR}/dialect/tensorrt/disabled_linear.mlir)
// RUN: infrtexec -i %s // RUN: infrtexec -i %s
module { module {
func @predict(%arg0: !infrt.dense_tensor<CPU, FP32, NCHW>) -> !infrt.dense_tensor<CPU, FP32, NCHW> { func @predict(%arg0: !infrt.dense_tensor<CPU, FP32, NCHW>,%filter: !infrt.dense_tensor<CPU, FP32, NCHW>, %arg1: !infrt.dense_tensor<CPU, FP32, NCHW>, %arg2: !infrt.dense_tensor<CPU, FP32, NCHW>, %arg3: !infrt.dense_tensor<CPU, FP32, NCHW>, %arg4: !infrt.dense_tensor<CPU, FP32, NCHW>) -> !infrt.dense_tensor<CPU, FP32, NCHW> {
%2 = "pd.abs"(%arg0) : (!infrt.dense_tensor<CPU, FP32, NCHW>) -> !infrt.dense_tensor<CPU, FP32, NCHW> %2 = "pd.abs"(%arg0) : (!infrt.dense_tensor<CPU, FP32, NCHW>) -> !infrt.dense_tensor<CPU, FP32, NCHW>
infrt.return %2 : !infrt.dense_tensor<CPU, FP32, NCHW> %3 = "pd.matmul_v2"(%arg0, %2) {trans_x = false, trans_y = false} : (!infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>) -> !infrt.dense_tensor<CPU, FP32, NCHW>
%4 = "pd.conv2d"(%3, %filter) {data_format = "NCHW", dilations = [1 : i32, 1 : i32], groups = 1 : si32, padding_algorithm = "EXPLICIT", paddings = [1 : i32, 1 : i32], strides = [2 : i32, 2 : i32]} : (!infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>) -> !infrt.dense_tensor<CPU, FP32, NCHW>
%Y, %MeanOut, %VarianceOut = "pd.batch_norm"(%4, %arg1, %arg2, %arg3, %arg4) {data_layout = "NCHW", epsilon = 9.99999974E-6 : f32, momentum = 0.899999976 : f32} : (!infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>) -> (!infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>)
%out = "pd.relu"(%Y) : (!infrt.dense_tensor<CPU, FP32, NCHW>) -> !infrt.dense_tensor<CPU, FP32, NCHW>
%5 = "pd.elementwise_add"(%out, %out) {axis = -1:si32} : (!infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>) -> !infrt.dense_tensor<CPU, FP32, NCHW>
infrt.return %5 : !infrt.dense_tensor<CPU, FP32, NCHW>
} }
func @main() { func @main() {
%ctx = "phi_dt.create_context.cpu" (): () -> !phi.context<CPU> %ctx = "phi_dt.create_context.cpu" (): () -> !phi.context<CPU>
%t = "phi_dt.create_dense_tensor.cpu" (%ctx) {precision=#infrt.precision<FP32>, layout=#infrt.layout<NCHW>, lod=[1:i64], dims=[1:i64]}: (!phi.context<CPU>) -> (!infrt.dense_tensor<CPU, FP32, NCHW>) %t = "phi_dt.create_dense_tensor.cpu" (%ctx) {precision=#infrt.precision<FP32>, layout=#infrt.layout<NCHW>, lod=[1], dims=[1, 3, 8, 8]}: (!phi.context<CPU>) -> (!infrt.dense_tensor<CPU, FP32, NCHW>)
"phi_dt.fill_dense_tensor.f32"(%t) {value=[3.8:f32]} : (!infrt.dense_tensor<CPU, FP32, NCHW>) -> () "phi_dt.fill_dense_tensor.f32"(%t) {value=[3.8:f32]} : (!infrt.dense_tensor<CPU, FP32, NCHW>) -> ()
%2 = infrt.call@predict(%t) : (!infrt.dense_tensor<CPU, FP32, NCHW>) -> !infrt.dense_tensor<CPU, FP32, NCHW> %filter = "phi_dt.create_dense_tensor.cpu" (%ctx) {precision=#infrt.precision<FP32>, layout=#infrt.layout<NCHW>, lod=[1], dims=[3, 3, 8, 8]}: (!phi.context<CPU>) -> (!infrt.dense_tensor<CPU, FP32, NCHW>)
"phi_dt.fill_dense_tensor.f32"(%filter) {value=[3.8:f32]} : (!infrt.dense_tensor<CPU, FP32, NCHW>) -> ()
%bias = "phi_dt.create_dense_tensor.cpu" (%ctx) {precision=#infrt.precision<FP32>, layout=#infrt.layout<NCHW>, lod=[1], dims=[3]}: (!phi.context<CPU>) -> (!infrt.dense_tensor<CPU, FP32, NCHW>)
"phi_dt.fill_dense_tensor.f32"(%bias) {value=[1.5:f32]} : (!infrt.dense_tensor<CPU, FP32, NCHW>) -> ()
%mean = "phi_dt.create_dense_tensor.cpu" (%ctx) {precision=#infrt.precision<FP32>, layout=#infrt.layout<NCHW>, lod=[1], dims=[3]}: (!phi.context<CPU>) -> (!infrt.dense_tensor<CPU, FP32, NCHW>)
"phi_dt.fill_dense_tensor.f32"(%mean) {value=[3.5:f32]} : (!infrt.dense_tensor<CPU, FP32, NCHW>) -> ()
%scale = "phi_dt.create_dense_tensor.cpu" (%ctx) {precision=#infrt.precision<FP32>, layout=#infrt.layout<NCHW>, lod=[1], dims=[3]}: (!phi.context<CPU>) -> (!infrt.dense_tensor<CPU, FP32, NCHW>)
"phi_dt.fill_dense_tensor.f32"(%scale) {value=[1.0:f32]} : (!infrt.dense_tensor<CPU, FP32, NCHW>) -> ()
%var = "phi_dt.create_dense_tensor.cpu" (%ctx) {precision=#infrt.precision<FP32>, layout=#infrt.layout<NCHW>, lod=[1], dims=[3]}: (!phi.context<CPU>) -> (!infrt.dense_tensor<CPU, FP32, NCHW>)
"phi_dt.fill_dense_tensor.f32"(%var) {value=[0.0:f32]} : (!infrt.dense_tensor<CPU, FP32, NCHW>) -> ()
%2 = infrt.call@predict(%t, %filter, %bias, %mean, %scale, %var) : (!infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>,!infrt.dense_tensor<CPU, FP32, NCHW>,!infrt.dense_tensor<CPU, FP32, NCHW>,!infrt.dense_tensor<CPU, FP32, NCHW>) -> !infrt.dense_tensor<CPU, FP32, NCHW>
//phi_dt.print_tensor(%t : !infrt.dense_tensor<CPU, FP32, NCHW>)
phi_dt.print_tensor(%2 : !infrt.dense_tensor<CPU, FP32, NCHW>) phi_dt.print_tensor(%2 : !infrt.dense_tensor<CPU, FP32, NCHW>)
infrt.return infrt.return
} }
......
module {
func @main_graph(%map: !phi.dense_tensor_map, %arg0: !infrt.dense_tensor<CPU, FP32, ANY>) -> !infrt.dense_tensor<CPU, FP32, ANY> {
%0 = "phi_dt.create_context.gpu"() : () -> !phi.context<GPU>
%1 = "phi_dt.memcpy.gpu"(%arg0, %0) {d2h = false} : (!infrt.dense_tensor<CPU, FP32, ANY>, !phi.context<GPU>) -> !infrt.dense_tensor<GPU, FP32, NCHW>
%3 = phi_dt.tensor_map_get_tensor(%map) {name = "linear_0.b_0"} -> !infrt.dense_tensor<CPU, FP32, NCHW>
%4 = phi_dt.tensor_map_get_tensor(%map) {name = "linear_0.w_0"} -> !infrt.dense_tensor<CPU, FP32, NCHW>
%5 = "trt.create_engine"(%1, %4, %3) ( {
%10 = "trt.FullyConnected"(%1, %4, %3) {out_channel_num = 10 : si32} : (!infrt.dense_tensor<GPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>) -> !infrt.dense_tensor<GPU, FP32, NCHW>
infrt.return %10 : !infrt.dense_tensor<GPU, FP32, NCHW>
}) {run_once = true} : (!infrt.dense_tensor<GPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>) -> !trt.engine
%6 = "trt.compute"(%5, %0) : (!trt.engine, !phi.context<GPU>) -> !infrt.tensor_list
%7 = "dt.tensor_list_get_tensor"(%6) {id = 0 : i32} : (!infrt.tensor_list) -> !infrt.dense_tensor<GPU, FP32, NCHW>
%8 = "phi_dt.memcpy.gpu"(%7, %0) {d2h = true} : (!infrt.dense_tensor<GPU, FP32, NCHW>, !phi.context<GPU>) -> !infrt.dense_tensor<CPU, FP32, ANY>
infrt.return %8 : !infrt.dense_tensor<CPU, FP32, ANY>
}
func @main() {
%map = phi_dt.load_combined_params(){model_path="@CMAKE_BINARY_DIR@/linear/linear.pdmodel",
params_path="@CMAKE_BINARY_DIR@/linear/linear.pdiparams"}
%ctx = "phi_dt.create_context.cpu" (): () -> !phi.context<CPU>
%input_tensor = "phi_dt.create_dense_tensor.cpu" (%ctx) {
precision=#infrt.precision<FP32>,
layout=#infrt.layout<NCHW>,
dims=[3:i64, 784:i64, 1:i64, 1:i64], lod=[1:i64]}: (!phi.context<CPU>) -> (!infrt.dense_tensor<CPU, FP32, NCHW>)
"phi_dt.fill_dense_tensor.f32"(%input_tensor) {value=[3.8:f32, 2.4:f32, 1.3:f32]} : (!infrt.dense_tensor<CPU, FP32, NCHW>) -> ()
%res = infrt.call @main_graph(%map, %input_tensor) {} : (!phi.dense_tensor_map, !infrt.dense_tensor<CPU, FP32, NCHW>) -> !infrt.dense_tensor<CPU, FP32, NCHW>
"phi_dt.print_tensor" (%res) : (!infrt.dense_tensor<CPU, FP32, NCHW>) -> ()
infrt.return
}
}
...@@ -518,6 +518,30 @@ class PADDLE_API Tensor final { ...@@ -518,6 +518,30 @@ class PADDLE_API Tensor final {
/* Part 10: Auto generated Tensor methods */ /* Part 10: Auto generated Tensor methods */
/* Part 11: Methods of converting SparseTensor and DenseTensor to each other
*/
/**
* @brief Convert DenseTensor or SparseCsrTensor to SparseCooTensor
*
* @param sparse_dim, The number of sparse dimensions
* @return Tensor
*/
Tensor to_sparse_coo(const int64_t sparse_dim) const;
/**
* @brief Convert DenseTensor or SparseCooTensor to SparseCsrTensor
*
* @return Tensor
*/
Tensor to_sparse_csr() const;
/**
* @brief Convert SparseCooTensor or SparseCsrTensor to DenseTensor
*
* @return Tensor
*/
Tensor to_dense() const;
private: private:
/** /**
* [ Why use abstract TensorImpl interface here? ] * [ Why use abstract TensorImpl interface here? ]
......
...@@ -149,4 +149,4 @@ cc_library(phi_bw_function_api SRCS ${bw_api_source_file} DEPS phi_tensor_raw ph ...@@ -149,4 +149,4 @@ cc_library(phi_bw_function_api SRCS ${bw_api_source_file} DEPS phi_tensor_raw ph
cc_library(sparse_api SRCS ${sparse_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils sparse_api_custom_impl) cc_library(sparse_api SRCS ${sparse_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils sparse_api_custom_impl)
cc_library(sparse_bw_api SRCS ${sparse_bw_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils sparse_api sparse_api_custom_impl) cc_library(sparse_bw_api SRCS ${sparse_bw_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils sparse_api sparse_api_custom_impl)
cc_library(phi_tensor SRCS tensor_method.cc DEPS phi_tensor_raw phi_function_api api_gen_utils kernel_dispatch infermeta) cc_library(phi_tensor SRCS tensor_method.cc DEPS phi_tensor_raw phi_function_api api_gen_utils kernel_dispatch infermeta sparse_api)
...@@ -19,6 +19,7 @@ limitations under the License. */ ...@@ -19,6 +19,7 @@ limitations under the License. */
#include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/tensor_base.h" #include "paddle/phi/core/tensor_base.h"
#include "paddle/phi/api/include/sparse_api.h"
#include "paddle/phi/api/lib/api_gen_utils.h" #include "paddle/phi/api/lib/api_gen_utils.h"
#include "paddle/phi/api/lib/kernel_dispatch.h" #include "paddle/phi/api/lib/kernel_dispatch.h"
#include "paddle/phi/infermeta/unary.h" #include "paddle/phi/infermeta/unary.h"
...@@ -183,5 +184,17 @@ void Tensor::copy_(const Tensor &src, ...@@ -183,5 +184,17 @@ void Tensor::copy_(const Tensor &src,
} }
} }
Tensor Tensor::to_sparse_coo(const int64_t sparse_dim) const {
return experimental::sparse::to_sparse_coo(*this, sparse_dim);
}
Tensor Tensor::to_sparse_csr() const {
return experimental::sparse::to_sparse_csr(*this);
}
Tensor Tensor::to_dense() const {
return experimental::sparse::to_dense(*this);
}
} // namespace experimental } // namespace experimental
} // namespace paddle } // namespace paddle
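A minimal usage sketch of the new conversion methods (assumes `dense` is an existing dense paddle::experimental::Tensor with at least two dimensions; illustration only):
paddle::experimental::Tensor coo = dense.to_sparse_coo(/*sparse_dim=*/2);
paddle::experimental::Tensor csr = dense.to_sparse_csr();
paddle::experimental::Tensor back = coo.to_dense();  // round-trips to a dense tensor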
...@@ -16,16 +16,18 @@ ...@@ -16,16 +16,18 @@
#include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include <ThreadPool.h>
namespace phi { namespace phi {
CallbackManager::CallbackManager(stream::Stream *stream) CallbackManager::CallbackManager(stream::Stream *stream)
: stream_(stream), thread_pool_(1) {} : stream_(stream), thread_pool_(new ::ThreadPool(1)) {}
void CallbackManager::AddCallback(std::function<void()> callback) const { void CallbackManager::AddCallback(std::function<void()> callback) const {
auto *callback_func = new std::function<void()>(std::move(callback)); auto *callback_func = new std::function<void()>(std::move(callback));
auto *func = new std::function<void()>([this, callback_func] { auto *func = new std::function<void()>([this, callback_func] {
std::lock_guard<std::mutex> lock(mtx_); std::lock_guard<std::mutex> lock(mtx_);
last_future_ = thread_pool_.enqueue([callback_func] { last_future_ = thread_pool_->enqueue([callback_func] {
std::unique_ptr<std::function<void()>> releaser(callback_func); std::unique_ptr<std::function<void()>> releaser(callback_func);
(*callback_func)(); (*callback_func)();
}); });
......
...@@ -14,8 +14,6 @@ ...@@ -14,8 +14,6 @@
#pragma once #pragma once
#include <ThreadPool.h>
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include <cuda.h> #include <cuda.h>
#include <cuda_runtime.h> #include <cuda_runtime.h>
...@@ -30,6 +28,8 @@ ...@@ -30,6 +28,8 @@
#include <memory> #include <memory>
#include <mutex> // NOLINT #include <mutex> // NOLINT
class ThreadPool;
namespace phi { namespace phi {
namespace stream { namespace stream {
...@@ -50,7 +50,7 @@ class CallbackManager { ...@@ -50,7 +50,7 @@ class CallbackManager {
private: private:
stream::Stream* stream_; stream::Stream* stream_;
mutable ::ThreadPool thread_pool_; mutable std::shared_ptr<::ThreadPool> thread_pool_;
mutable std::mutex mtx_; mutable std::mutex mtx_;
mutable std::future<void> last_future_; mutable std::future<void> last_future_;
}; };
......
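The header now holds the pool behind std::shared_ptr and only forward-declares ThreadPool, so <ThreadPool.h> is included by the .cc file alone and no longer has to be pulled in by every file that includes this header. The same pattern in isolation (names hypothetical, illustration only):
// widget.h
#pragma once
#include <memory>
class ThreadPool;                    // forward declaration, no heavy include
class Widget {
  std::shared_ptr<ThreadPool> pool_;  // a shared_ptr member may be declared with an incomplete type
};
// widget.cc
#include <ThreadPool.h>              // the full definition is needed only where the pool is used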
...@@ -14,6 +14,8 @@ ...@@ -14,6 +14,8 @@
#pragma once #pragma once
#ifdef PADDLE_WITH_CUSTOM_DEVICE #ifdef PADDLE_WITH_CUSTOM_DEVICE
#include <vector>
#include "paddle/phi/backends/event.h" #include "paddle/phi/backends/event.h"
#include "paddle/phi/backends/stream.h" #include "paddle/phi/backends/stream.h"
......
...@@ -124,6 +124,10 @@ class OpUtilsMap { ...@@ -124,6 +124,10 @@ class OpUtilsMap {
{std::move(op_type), std::move(base_kernel_name)}); {std::move(op_type), std::move(base_kernel_name)});
} }
bool HasArgumentMappingFn(const std::string& op_type) const {
return arg_mapping_fn_map_.count(op_type);
}
void InsertArgumentMappingFn(std::string op_type, ArgumentMappingFn fn) { void InsertArgumentMappingFn(std::string op_type, ArgumentMappingFn fn) {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
arg_mapping_fn_map_.count(op_type), arg_mapping_fn_map_.count(op_type),
......
...@@ -832,6 +832,50 @@ void MultiDotInferMeta(const std::vector<MetaTensor*>& x, MetaTensor* out) { ...@@ -832,6 +832,50 @@ void MultiDotInferMeta(const std::vector<MetaTensor*>& x, MetaTensor* out) {
out->share_lod(*x.at(0)); out->share_lod(*x.at(0));
} }
void MultiplexInferMeta(const std::vector<MetaTensor*>& ins,
const MetaTensor& ids,
MetaTensor* out) {
PADDLE_ENFORCE_NE(
ins.empty(),
true,
phi::errors::InvalidArgument("MultiInput(X) shouldn't be empty."));
auto ids_dim = ids.dims();
PADDLE_ENFORCE_EQ(ids_dim.size(),
2,
phi::errors::PreconditionNotMet(
"The index tensor must be a vector with 2 dimensions"));
PADDLE_ENFORCE_EQ(
ids_dim[1],
1,
phi::errors::PreconditionNotMet(
"The index tensor must be a vector with batchSize x 1."));
auto ins_dims = GetMetaTensorsDim(ins);
auto num_ins = ins_dims.size();
PADDLE_ENFORCE_GT(
num_ins,
1,
phi::errors::InvalidArgument("multiplex operator should have more than "
"one candidate input tensors."));
auto in_dim = ins_dims[0];
PADDLE_ENFORCE_GE(
in_dim.size(),
2,
phi::errors::InvalidArgument(
"The rank of candidate tensors must be not less than 2."));
for (size_t i = 1; i < num_ins; i++) {
auto dim = ins_dims[i];
PADDLE_ENFORCE_EQ(
in_dim,
dim,
phi::errors::PreconditionNotMet(
"All the candidate tensors must have the same size."));
}
out->set_dims(in_dim);
out->set_dtype(ins[0]->dtype());
}
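// Worked example (a sketch, not part of the original source): with three
// candidate inputs of shape [4, 5] and ids of shape [4, 1], every check above
// passes and out gets shape [4, 5] with the dtype of ins[0].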
void PsroiPoolInferMeta(const MetaTensor& x, void PsroiPoolInferMeta(const MetaTensor& x,
const MetaTensor& rois, const MetaTensor& rois,
paddle::optional<const MetaTensor&> rois_num, paddle::optional<const MetaTensor&> rois_num,
......
...@@ -152,6 +152,10 @@ void HierarchicalSigmoidInferMeta(const MetaTensor& x, ...@@ -152,6 +152,10 @@ void HierarchicalSigmoidInferMeta(const MetaTensor& x,
void MultiDotInferMeta(const std::vector<MetaTensor*>& x, MetaTensor* out); void MultiDotInferMeta(const std::vector<MetaTensor*>& x, MetaTensor* out);
void MultiplexInferMeta(const std::vector<MetaTensor*>& ins,
const MetaTensor& ids,
MetaTensor* out);
void PsroiPoolInferMeta(const MetaTensor& x, void PsroiPoolInferMeta(const MetaTensor& x,
const MetaTensor& rois, const MetaTensor& rois,
paddle::optional<const MetaTensor&> rois_num, paddle::optional<const MetaTensor&> rois_num,
......
...@@ -22,6 +22,7 @@ limitations under the License. */ ...@@ -22,6 +22,7 @@ limitations under the License. */
#include "paddle/phi/common/type_traits.h" #include "paddle/phi/common/type_traits.h"
#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/kernels/funcs/parse_qr_mode.h"
#include "paddle/phi/kernels/funcs/pooling.h" #include "paddle/phi/kernels/funcs/pooling.h"
#include "paddle/phi/kernels/funcs/unfold_functor.h" #include "paddle/phi/kernels/funcs/unfold_functor.h"
#include "paddle/phi/kernels/funcs/unsqueeze.h" #include "paddle/phi/kernels/funcs/unsqueeze.h"
...@@ -1129,6 +1130,44 @@ void RealAndImagInferMeta(const MetaTensor& x, MetaTensor* out) { ...@@ -1129,6 +1130,44 @@ void RealAndImagInferMeta(const MetaTensor& x, MetaTensor* out) {
out->set_layout(x.layout()); out->set_layout(x.layout());
} }
void QrInferMeta(const MetaTensor& x,
const std::string& mode,
MetaTensor* q,
MetaTensor* r) {
auto x_dims = x.dims();
int x_rank = x_dims.size();
PADDLE_ENFORCE_GE(
x_dims.size(),
2,
phi::errors::InvalidArgument("the rank of input must greater than 2"));
bool compute_q;
bool reduced_mode;
int m = x_dims[x_rank - 2];
int n = x_dims[x_rank - 1];
int min_mn = std::min(m, n);
std::tie(compute_q, reduced_mode) = phi::funcs::ParseQrMode(mode);
if (compute_q) {
int k = reduced_mode ? min_mn : m;
auto q_dims_vec = phi::vectorize(x_dims);
q_dims_vec[q_dims_vec.size() - 1] = k;
q->set_dims(phi::make_ddim(q_dims_vec));
} else {
q->set_dims(phi::make_ddim({0}));
}
int k = reduced_mode ? min_mn : m;
auto r_dims_vec = phi::vectorize(x_dims);
r_dims_vec[r_dims_vec.size() - 2] = k;
r_dims_vec[r_dims_vec.size() - 1] = n;
r->set_dims(phi::make_ddim(r_dims_vec));
q->share_lod(x);
r->share_lod(x);
q->set_dtype(x.dtype());
r->set_dtype(x.dtype());
}
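// Worked example (a sketch, not part of the original source): for x of shape
// [6, 4], min_mn = 4; mode "reduced" gives q: [6, 4] and r: [4, 4]; mode
// "complete" gives q: [6, 6] and r: [6, 4]; mode "r" skips q (dims {0}) and
// gives r: [4, 4].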
DDim ReduceInferDim(const MetaTensor& x, DDim ReduceInferDim(const MetaTensor& x,
const std::vector<int64_t>& axis, const std::vector<int64_t>& axis,
bool keep_dim, bool keep_dim,
...@@ -1847,6 +1886,20 @@ void UnbindInferMeta(const MetaTensor& x, ...@@ -1847,6 +1886,20 @@ void UnbindInferMeta(const MetaTensor& x,
} }
} }
void TrilTriuInferMeta(const MetaTensor& x,
int diagonal,
bool lower,
MetaTensor* out) {
const auto& x_dims = x.dims();
PADDLE_ENFORCE_GE(x_dims.size(),
2,
phi::errors::InvalidArgument(
"Input(X)'s rank must be at least 2 in TrilTriuOp."));
out->set_dims(x.dims());
out->share_lod(x);
out->set_dtype(x.dtype());
}
void UnchangedInferMeta(const MetaTensor& x, MetaTensor* out) { void UnchangedInferMeta(const MetaTensor& x, MetaTensor* out) {
out->share_meta(x); out->share_meta(x);
} }
......
...@@ -180,6 +180,11 @@ void PoolInferMeta(const MetaTensor& x, ...@@ -180,6 +180,11 @@ void PoolInferMeta(const MetaTensor& x,
MetaTensor* out, MetaTensor* out,
MetaConfig config = MetaConfig()); MetaConfig config = MetaConfig());
void QrInferMeta(const MetaTensor& x,
const std::string& mode,
MetaTensor* q,
MetaTensor* r);
void RealAndImagInferMeta(const MetaTensor& x, MetaTensor* out); void RealAndImagInferMeta(const MetaTensor& x, MetaTensor* out);
void ReduceInferMeta(const MetaTensor& x, void ReduceInferMeta(const MetaTensor& x,
...@@ -282,6 +287,11 @@ void TransposeGradInferMeta(const MetaTensor& x, ...@@ -282,6 +287,11 @@ void TransposeGradInferMeta(const MetaTensor& x,
const std::vector<int>& axis, const std::vector<int>& axis,
MetaTensor* out); MetaTensor* out);
void TrilTriuInferMeta(const MetaTensor& x,
int diagonal,
bool lower,
MetaTensor* out);
void UnbindInferMeta(const MetaTensor& x, void UnbindInferMeta(const MetaTensor& x,
int axis, int axis,
std::vector<MetaTensor>* outs); std::vector<MetaTensor>* outs);
......
...@@ -62,6 +62,8 @@ register_kernels(EXCLUDES ${COMMON_BAISC_KERNELS} ${MANUAL_BUILD_KERNELS} DEPS $ ...@@ -62,6 +62,8 @@ register_kernels(EXCLUDES ${COMMON_BAISC_KERNELS} ${MANUAL_BUILD_KERNELS} DEPS $
# phi sparse kernels # phi sparse kernels
add_subdirectory(sparse) add_subdirectory(sparse)
# phi selected_rows kernels
add_subdirectory(selected_rows)
copy_if_different(${kernel_declare_file} ${kernel_declare_file_final}) copy_if_different(${kernel_declare_file} ${kernel_declare_file_final})
......
...@@ -45,3 +45,17 @@ PD_REGISTER_KERNEL(matmul_triple_grad, ...@@ -45,3 +45,17 @@ PD_REGISTER_KERNEL(matmul_triple_grad,
double, double,
phi::dtype::complex<float>, phi::dtype::complex<float>,
phi::dtype::complex<double>) {} phi::dtype::complex<double>) {}
PD_REGISTER_KERNEL(matmul_with_flatten_grad,
CPU,
ALL_LAYOUT,
phi::MatmulWithFlattenGradKernel,
float,
double) {}
PD_REGISTER_KERNEL(matmul_with_flatten_double_grad,
CPU,
ALL_LAYOUT,
phi::MatmulWithFlattenDoubleGradKernel,
float,
double) {}
...@@ -28,3 +28,10 @@ PD_REGISTER_KERNEL(matmul, ...@@ -28,3 +28,10 @@ PD_REGISTER_KERNEL(matmul,
double, double,
phi::dtype::complex<float>, phi::dtype::complex<float>,
phi::dtype::complex<double>) {} phi::dtype::complex<double>) {}
PD_REGISTER_KERNEL(matmul_with_flatten,
CPU,
ALL_LAYOUT,
phi::MatmulWithFlattenKernel,
float,
double) {}
...@@ -19,30 +19,10 @@ ...@@ -19,30 +19,10 @@
#include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/complex_functors.h" #include "paddle/phi/kernels/funcs/complex_functors.h"
#include "paddle/phi/kernels/funcs/parse_qr_mode.h"
namespace phi { namespace phi {
static inline std::tuple<bool, bool> ParseQrMode(const std::string& mode) {
bool compute_q;
bool reduced;
if (mode == "reduced") {
compute_q = true;
reduced = true;
} else if (mode == "complete") {
compute_q = true;
reduced = false;
} else if (mode == "r") {
compute_q = false;
reduced = true;
} else {
PADDLE_THROW(errors::InvalidArgument(
"QR received unrecognized mode '%s'"
" but expected one of 'reduced' (default), 'r', or 'complete'",
mode));
}
return std::make_tuple(compute_q, reduced);
}
template <typename T, typename Context> template <typename T, typename Context>
void QrKernel(const Context& ctx, void QrKernel(const Context& ctx,
const DenseTensor& x, const DenseTensor& x,
...@@ -51,7 +31,7 @@ void QrKernel(const Context& ctx, ...@@ -51,7 +31,7 @@ void QrKernel(const Context& ctx,
DenseTensor* r) { DenseTensor* r) {
bool compute_q; bool compute_q;
bool reduced_mode; bool reduced_mode;
std::tie(compute_q, reduced_mode) = ParseQrMode(mode); std::tie(compute_q, reduced_mode) = phi::funcs::ParseQrMode(mode);
auto numel = x.numel(); auto numel = x.numel();
PADDLE_ENFORCE_GT( PADDLE_ENFORCE_GT(
numel, 0, errors::PreconditionNotMet("The input of QR is empty.")); numel, 0, errors::PreconditionNotMet("The input of QR is empty."));
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <tuple>
#include "paddle/phi/core/enforce.h"
namespace phi {
namespace funcs {
static inline std::tuple<bool, bool> ParseQrMode(const std::string& mode) {
bool compute_q;
bool reduced;
if (mode == "reduced") {
compute_q = true;
reduced = true;
} else if (mode == "complete") {
compute_q = true;
reduced = false;
} else if (mode == "r") {
compute_q = false;
reduced = true;
} else {
PADDLE_THROW(errors::InvalidArgument(
"QR received unrecognized mode '%s'"
" but expected one of 'reduced' (default), 'r', or 'complete'",
mode));
}
return std::make_tuple(compute_q, reduced);
}
} // namespace funcs
} // namespace phi
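A minimal usage sketch of the relocated helper (illustration only):
bool compute_q = false;
bool reduced = false;
std::tie(compute_q, reduced) = phi::funcs::ParseQrMode("complete");
// compute_q == true, reduced == false; an unrecognized mode string throws InvalidArgument.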
...@@ -453,25 +453,20 @@ struct ReduceConfig { ...@@ -453,25 +453,20 @@ struct ReduceConfig {
void SetReduceType() { void SetReduceType() {
int rank = x_dim.size(); int rank = x_dim.size();
int reduce_rank = reduce_dim.size(); int reduce_rank = reduce_dim.size();
bool is_last_dim =
(rank == 2) && (reduce_rank == 1) && (reduce_dim[0] == 1);
if (rank == reduce_rank || is_last_dim) {
#ifdef PADDLE_WITH_XPU_KP #ifdef PADDLE_WITH_XPU_KP
reduce_type = static_cast<int>(ReduceType::kReduceAny); bool not_higher = x_dim[0] > 1;
#else #else
reduce_type = static_cast<int>(ReduceType::kReduceLastDim); int device_id = paddle::platform::GetCurrentDeviceId();
int max_grid_z = phi::backends::gpu::GetGpuMaxGridDimSize(device_id)[2];
bool not_higher = x_dim[0] >= max_grid_z;
#endif #endif
if (reduce_last_dim && (reduce_rank == 1)) {
reduce_type = static_cast<int>(ReduceType::kReduceLastDim);
} else if (reduce_rank == 1) { } else if (reduce_rank == 1) {
// ReduceFirstDim and reduceSecondDim reduce_type = static_cast<int>(ReduceType::kReduceHigherDim);
#ifdef PADDLE_WITH_XPU_KP if (rank == 3 && not_higher) {
if (reduce_dim[0] == 0) {
reduce_type = static_cast<int>(ReduceType::kReduceHigherDim);
} else {
reduce_type = static_cast<int>(ReduceType::kReduceAny); reduce_type = static_cast<int>(ReduceType::kReduceAny);
} }
#else
reduce_type = static_cast<int>(ReduceType::kReduceHigherDim);
#endif
} else { } else {
reduce_type = static_cast<int>(ReduceType::kReduceAny); reduce_type = static_cast<int>(ReduceType::kReduceAny);
} }
...@@ -648,7 +643,8 @@ __global__ void ReduceAnyKernel(const Tx* x, ...@@ -648,7 +643,8 @@ __global__ void ReduceAnyKernel(const Tx* x,
bool reduce_last_dim, bool reduce_last_dim,
const Calculator reduce_index_calculator, const Calculator reduce_index_calculator,
const Calculator left_index_calculator, const Calculator left_index_calculator,
const kps::DimConfig dim) { const kps::DimConfig dim,
bool is_mean) {
int input_idx, left_idx, stride; int input_idx, left_idx, stride;
int block_size = 0; int block_size = 0;
bool need_store = true; bool need_store = true;
...@@ -752,7 +748,9 @@ __global__ void ReduceAnyKernel(const Tx* x, ...@@ -752,7 +748,9 @@ __global__ void ReduceAnyKernel(const Tx* x,
kps::Reduce<MPType, 1, 1, 1, ReduceOp, kps::details::kGlobalMode>( kps::Reduce<MPType, 1, 1, 1, ReduceOp, kps::details::kGlobalMode>(
&reduce_var, &reduce_var, reducer, reduce_last_dim); &reduce_var, &reduce_var, reducer, reduce_last_dim);
if (is_mean) {
reduce_var = reduce_var / static_cast<MPType>(reduce_num);
}
Ty result = static_cast<Ty>(reduce_var); Ty result = static_cast<Ty>(reduce_var);
kps::details::WriteData<Ty>( kps::details::WriteData<Ty>(
y + store_offset + i, &result, static_cast<int>(need_store)); y + store_offset + i, &result, static_cast<int>(need_store));
...@@ -772,7 +770,9 @@ __global__ void ReduceHigherDimKernel(const Tx* x, ...@@ -772,7 +770,9 @@ __global__ void ReduceHigherDimKernel(const Tx* x,
int reduce_num, int reduce_num,
int left_num, int left_num,
int blocking_size, int blocking_size,
const kps::DimConfig dim) { const kps::DimConfig dim,
int mean_div,
bool is_mean) {
// when reduce_dim.size() == 1 and reduce_dim[0] != x_dim.size() - 1, this // when reduce_dim.size() == 1 and reduce_dim[0] != x_dim.size() - 1, this
// function will be used // function will be used
auto block = ReduceIndexMapping<false>(dim); auto block = ReduceIndexMapping<false>(dim);
...@@ -806,6 +806,9 @@ __global__ void ReduceHigherDimKernel(const Tx* x, ...@@ -806,6 +806,9 @@ __global__ void ReduceHigherDimKernel(const Tx* x,
kps::details::ReduceMode::kLocalMode>( kps::details::ReduceMode::kLocalMode>(
&reduce_var, &reduce_compute, reducer, false); &reduce_var, &reduce_compute, reducer, false);
} }
if (is_mean) {
reduce_var = reduce_var / static_cast<MPType>(mean_div);
}
Ty result = static_cast<Ty>(reduce_var); Ty result = static_cast<Ty>(reduce_var);
kps::WriteData<Ty, 1, 1, 1, false>( kps::WriteData<Ty, 1, 1, 1, false>(
y + store_offset + idx, &result, block.BlockDimX()); y + store_offset + idx, &result, block.BlockDimX());
...@@ -831,6 +834,10 @@ __global__ void ReduceHigherDimKernel(const Tx* x, ...@@ -831,6 +834,10 @@ __global__ void ReduceHigherDimKernel(const Tx* x,
kps::details::ReduceMode::kLocalMode>( kps::details::ReduceMode::kLocalMode>(
&reduce_var, &reduce_compute, reducer, false); &reduce_var, &reduce_compute, reducer, false);
} }
if (is_mean) {
reduce_var = reduce_var / static_cast<MPType>(mean_div);
}
Ty result = static_cast<Ty>(reduce_var); Ty result = static_cast<Ty>(reduce_var);
kps::WriteData<Ty, 1, 1, 1, true>( kps::WriteData<Ty, 1, 1, 1, true>(
y + store_offset + idx, &result, dim.rem_x); y + store_offset + idx, &result, dim.rem_x);
...@@ -848,7 +855,8 @@ static void LaunchReduceKernel(const Tx* x_data, ...@@ -848,7 +855,8 @@ static void LaunchReduceKernel(const Tx* x_data,
const TransformOp& transform, const TransformOp& transform,
MPType init, MPType init,
KPStream stream, KPStream stream,
ReduceConfig<Ty> config) { ReduceConfig<Ty> config,
bool is_mean = false) {
if (config.reduce_type == kReduceLastDim) { if (config.reduce_type == kReduceLastDim) {
int stride_reduce = 1; int stride_reduce = 1;
int stride_left = config.reduce_num; int stride_left = config.reduce_num;
...@@ -887,7 +895,8 @@ static void LaunchReduceKernel(const Tx* x_data, ...@@ -887,7 +895,8 @@ static void LaunchReduceKernel(const Tx* x_data,
config.reduce_last_dim, config.reduce_last_dim,
reduce_index_calculator, reduce_index_calculator,
left_index_calculator, left_index_calculator,
dim); dim,
is_mean && (!config.should_reduce_again));
} else { } else {
int reduce_rank = config.reduce_strides.size(); int reduce_rank = config.reduce_strides.size();
...@@ -930,7 +939,8 @@ static void LaunchReduceKernel(const Tx* x_data, ...@@ -930,7 +939,8 @@ static void LaunchReduceKernel(const Tx* x_data,
config.reduce_last_dim, config.reduce_last_dim,
reduce_index_calculator, reduce_index_calculator,
left_index_calculator, left_index_calculator,
dim); dim,
is_mean && (!config.should_reduce_again));
} }
if (config.should_reduce_again) { if (config.should_reduce_again) {
...@@ -950,15 +960,18 @@ static void LaunchReduceKernel(const Tx* x_data, ...@@ -950,15 +960,18 @@ static void LaunchReduceKernel(const Tx* x_data,
kps::DimConfig(grid.x, grid.y, grid.z, block.x, config.grid.y, 0); kps::DimConfig(grid.x, grid.y, grid.z, block.x, config.grid.y, 0);
dim.SetRem(config.left_num % block.x, 0, 0); dim.SetRem(config.left_num % block.x, 0, 0);
#ifdef PADDLE_WITH_XPU_KP #ifdef PADDLE_WITH_XPU_KP
grid = 8; int grid_size = 8;
block = 64; int block_size = 64;
#else
auto grid_size = grid;
auto block_size = block;
#endif #endif
ReduceHigherDimKernel< ReduceHigherDimKernel<
Ty, Ty,
Ty, Ty,
MPType, MPType,
ReduceOp, ReduceOp,
kps::IdentityFunctor<Ty, MPType>><<<grid, block, 0, stream>>>( kps::IdentityFunctor<Ty, MPType>><<<grid_size, block_size, 0, stream>>>(
config.output_data, config.output_data,
y_data, y_data,
reducer, reducer,
...@@ -967,7 +980,9 @@ static void LaunchReduceKernel(const Tx* x_data, ...@@ -967,7 +980,9 @@ static void LaunchReduceKernel(const Tx* x_data,
config.grid.y, config.grid.y,
config.left_num, config.left_num,
config.grid.y, config.grid.y,
dim); dim,
config.reduce_num,
is_mean);
} }
} }
...@@ -1034,7 +1049,8 @@ void ReduceKernel(const KPDevice& dev_ctx, ...@@ -1034,7 +1049,8 @@ void ReduceKernel(const KPDevice& dev_ctx,
const phi::DenseTensor& x, const phi::DenseTensor& x,
phi::DenseTensor* y, phi::DenseTensor* y,
const TransformOp& transform, const TransformOp& transform,
const std::vector<int>& origin_reduce_dims) { const std::vector<int>& origin_reduce_dims,
bool is_mean = false) {
#ifdef PADDLE_WITH_XPU_KP #ifdef PADDLE_WITH_XPU_KP
auto stream = dev_ctx.x_context()->xpu_stream; auto stream = dev_ctx.x_context()->xpu_stream;
#else #else
...@@ -1069,8 +1085,18 @@ void ReduceKernel(const KPDevice& dev_ctx, ...@@ -1069,8 +1085,18 @@ void ReduceKernel(const KPDevice& dev_ctx,
bool use_cub_reduce = config.reduce_num == numel && !kIsTxFP16; bool use_cub_reduce = config.reduce_num == numel && !kIsTxFP16;
#ifndef PADDLE_WITH_XPU_KP #ifndef PADDLE_WITH_XPU_KP
if (use_cub_reduce) { if (use_cub_reduce) {
CubTensorReduceImpl<Tx, Ty, ReduceOp, TransformOp>( if (is_mean) {
x_data, y_data, transform, config.reduce_num, dev_ctx, stream); using Div = kps::DivideFunctor<Tx>;
CubTensorReduceImpl<Tx, Ty, ReduceOp, Div>(x_data,
y_data,
Div(config.reduce_num),
config.reduce_num,
dev_ctx,
stream);
} else {
CubTensorReduceImpl<Tx, Ty, ReduceOp, TransformOp>(
x_data, y_data, transform, config.reduce_num, dev_ctx, stream);
}
return; return;
} }
#endif #endif
...@@ -1115,7 +1141,9 @@ void ReduceKernel(const KPDevice& dev_ctx, ...@@ -1115,7 +1141,9 @@ void ReduceKernel(const KPDevice& dev_ctx,
config.reduce_num, config.reduce_num,
config.left_num, config.left_num,
config.blocking_size, config.blocking_size,
dim); dim,
config.reduce_num,
is_mean && (!config.should_reduce_again));
if (config.should_reduce_again) { if (config.should_reduce_again) {
dim3 block = dim3(config.block.x, 1, 1); dim3 block = dim3(config.block.x, 1, 1);
...@@ -1125,15 +1153,19 @@ void ReduceKernel(const KPDevice& dev_ctx, ...@@ -1125,15 +1153,19 @@ void ReduceKernel(const KPDevice& dev_ctx,
dim2.SetRem(config.left_num % config.block.x, 0, 0); dim2.SetRem(config.left_num % config.block.x, 0, 0);
#ifdef PADDLE_WITH_XPU_KP #ifdef PADDLE_WITH_XPU_KP
grid = 8; int grid_size = 8;
block = 64; int block_size = 64;
#else
auto grid_size = grid;
auto block_size = block;
#endif #endif
ReduceHigherDimKernel< ReduceHigherDimKernel<
Ty, Ty,
Ty, Ty,
MPType, MPType,
ReduceOp<MPType>, ReduceOp<MPType>,
kps::IdentityFunctor<Ty, MPType>><<<grid, block, 0, stream>>>( kps::IdentityFunctor<Ty,
MPType>><<<grid_size, block_size, 0, stream>>>(
config.output_data, config.output_data,
y_data, y_data,
reducer, reducer,
...@@ -1142,7 +1174,9 @@ void ReduceKernel(const KPDevice& dev_ctx, ...@@ -1142,7 +1174,9 @@ void ReduceKernel(const KPDevice& dev_ctx,
config.grid.y, config.grid.y,
config.left_num, config.left_num,
config.grid.y, config.grid.y,
dim2); dim2,
config.reduce_num,
is_mean);
} }
return; return;
} }
...@@ -1151,7 +1185,14 @@ void ReduceKernel(const KPDevice& dev_ctx, ...@@ -1151,7 +1185,14 @@ void ReduceKernel(const KPDevice& dev_ctx,
// when reduce_dim.size() != 1 and reduce_dim.size() != x_dim.size(), this // when reduce_dim.size() != 1 and reduce_dim.size() != x_dim.size(), this
// function will be used // function will be used
LaunchReduceKernel<Tx, Ty, MPType, ReduceOp<MPType>, TransformOp>( LaunchReduceKernel<Tx, Ty, MPType, ReduceOp<MPType>, TransformOp>(
x_data, y_data, reducer, transform, reducer.initial(), stream, config); x_data,
y_data,
reducer,
transform,
reducer.initial(),
stream,
config,
is_mean);
} }
} // namespace funcs } // namespace funcs
......
...@@ -49,3 +49,19 @@ PD_REGISTER_KERNEL(matmul_triple_grad, ...@@ -49,3 +49,19 @@ PD_REGISTER_KERNEL(matmul_triple_grad,
phi::dtype::float16, phi::dtype::float16,
phi::dtype::complex<float>, phi::dtype::complex<float>,
phi::dtype::complex<double>) {} phi::dtype::complex<double>) {}
PD_REGISTER_KERNEL(matmul_with_flatten_grad,
GPU,
ALL_LAYOUT,
phi::MatmulWithFlattenGradKernel,
float,
double,
phi::dtype::float16) {}
PD_REGISTER_KERNEL(matmul_with_flatten_double_grad,
GPU,
ALL_LAYOUT,
phi::MatmulWithFlattenDoubleGradKernel,
float,
double,
phi::dtype::float16) {}
...@@ -30,3 +30,11 @@ PD_REGISTER_KERNEL(matmul, ...@@ -30,3 +30,11 @@ PD_REGISTER_KERNEL(matmul,
phi::dtype::bfloat16, phi::dtype::bfloat16,
phi::dtype::complex<float>, phi::dtype::complex<float>,
phi::dtype::complex<double>) {} phi::dtype::complex<double>) {}
PD_REGISTER_KERNEL(matmul_with_flatten,
GPU,
ALL_LAYOUT,
phi::MatmulWithFlattenKernel,
float,
double,
phi::dtype::float16) {}
...@@ -30,7 +30,8 @@ void Reduce(const KPDevice& dev_ctx, ...@@ -30,7 +30,8 @@ void Reduce(const KPDevice& dev_ctx,
const std::vector<int64_t>& dims, const std::vector<int64_t>& dims,
bool keep_dim, bool keep_dim,
DataType out_dtype, DataType out_dtype,
DenseTensor* out) { DenseTensor* out,
bool is_mean = false) {
std::vector<int> reduce_dims = std::vector<int> reduce_dims =
phi::funcs::details::GetReduceDim(dims, x.dims().size(), reduce_all); phi::funcs::details::GetReduceDim(dims, x.dims().size(), reduce_all);
...@@ -57,12 +58,18 @@ void Reduce(const KPDevice& dev_ctx, ...@@ -57,12 +58,18 @@ void Reduce(const KPDevice& dev_ctx,
tmp_tensor, tmp_tensor,
out, out,
TransformOp<data_t, MPType>(reduce_num), TransformOp<data_t, MPType>(reduce_num),
reduce_dims); reduce_dims,
is_mean);
})); }));
} else { } else {
using MPType = typename kps::details::MPTypeTrait<T>::Type; using MPType = typename kps::details::MPTypeTrait<T>::Type;
phi::funcs::ReduceKernel<T, T, ReduceOp, TransformOp<T, MPType>>( phi::funcs::ReduceKernel<T, T, ReduceOp, TransformOp<T, MPType>>(
dev_ctx, x, out, TransformOp<T, MPType>(reduce_num), reduce_dims); dev_ctx,
x,
out,
TransformOp<T, MPType>(reduce_num),
reduce_dims,
is_mean);
} }
} }
} // namespace phi } // namespace phi
......
...@@ -27,8 +27,8 @@ void MeanRawKernel(const Context& dev_ctx, ...@@ -27,8 +27,8 @@ void MeanRawKernel(const Context& dev_ctx,
bool reduce_all, bool reduce_all,
DenseTensor* out) { DenseTensor* out) {
auto out_dtype = x.dtype(); auto out_dtype = x.dtype();
phi::Reduce<T, kps::AddFunctor, kps::DivideFunctor>( phi::Reduce<T, kps::AddFunctor, kps::IdentityFunctor>(
dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out, true);
} }
template <typename T, typename Context> template <typename T, typename Context>
......
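The is_mean plumbing above fuses the mean into the reduce itself: instead of pre-scaling every element with DivideFunctor (mean = sum_i(x_i / N)), the kernel now accumulates the raw sum and divides once per output, mean = (1/N) * sum_i x_i, either inside the reduce kernels or via Div(config.reduce_num) on the cub path. When a second reduction pass is needed, the first pass is launched with is_mean && !config.should_reduce_again, so the division happens exactly once, in the final pass.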
...@@ -1731,4 +1731,163 @@ void MatmulTripleGradKernel(const Context& dev_ctx, ...@@ -1731,4 +1731,163 @@ void MatmulTripleGradKernel(const Context& dev_ctx,
} }
} }
template <typename T, typename Context>
void MatmulWithFlattenGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
const DenseTensor& out_grad,
int x_num_col_dims,
int y_num_col_dims,
DenseTensor* x_grad,
DenseTensor* y_grad) {
auto x_matrix = x.dims().size() > 2
? paddle::framework::ReshapeToMatrix(x, x_num_col_dims)
: x;
auto y_matrix = y.dims().size() > 2
? paddle::framework::ReshapeToMatrix(y, y_num_col_dims)
: y;
auto* dout = &out_grad;
DenseTensor dout_mat(*dout);
dout_mat.Resize({phi::flatten_to_2d(x.dims(), x_num_col_dims)[0],
phi::flatten_to_2d(y.dims(), y_num_col_dims)[1]});
auto* dx = x_grad;
auto* dy = y_grad;
if (dx != nullptr) {
dx->set_lod(x.lod());
}
if (dy != nullptr) {
dy->set_lod(y.lod());
}
auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
if (dx) {
dev_ctx.template Alloc<T>(dx);
DenseTensor dx_matrix =
dx->dims().size() > 2
? paddle::framework::ReshapeToMatrix(*dx, x_num_col_dims)
: *dx;
// dx = dout * y'. dx: M x K, dout : M x N, y : K x N
blas.MatMul(dout_mat, false, y_matrix, true, &dx_matrix);
}
if (dy) {
dev_ctx.template Alloc<T>(dy);
DenseTensor dy_matrix =
dy->dims().size() > 2
? paddle::framework::ReshapeToMatrix(*dy, y_num_col_dims)
: *dy;
// dy = x' * dout. dy K x N, dout : M x N, x : M x K
blas.MatMul(x_matrix, true, dout_mat, false, &dy_matrix);
}
}
template <typename T, typename Context>
void MatmulWithFlattenDoubleGradKernel(
const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
const DenseTensor& out_grad,
paddle::optional<const DenseTensor&> x_grad_grad,
paddle::optional<const DenseTensor&> y_grad_grad,
int x_num_col_dims,
int y_num_col_dims,
DenseTensor* x_grad,
DenseTensor* y_grad,
DenseTensor* out_grad_grad) {
auto x_mat = x.dims().size() > 2
? paddle::framework::ReshapeToMatrix(x, x_num_col_dims)
: x;
auto y_mat = y.dims().size() > 2
? paddle::framework::ReshapeToMatrix(y, y_num_col_dims)
: y;
const int m = phi::flatten_to_2d(x.dims(), x_num_col_dims)[0];
const int n = phi::flatten_to_2d(y.dims(), y_num_col_dims)[1];
auto* dout = &out_grad;
DenseTensor dout_mat(*dout);
dout_mat.Resize({m, n});
auto* ddx = x_grad_grad.get_ptr();
auto* ddy = y_grad_grad.get_ptr();
auto* dx = x_grad;
auto* dy = y_grad;
auto* ddout = out_grad_grad;
DenseTensor ddout_mat;
if (ddout) {
ddout->set_lod(dout->lod());
// allocate and reshape ddout
dev_ctx.template Alloc<T>(ddout);
ddout_mat.ShareDataWith(*ddout);
ddout_mat.Resize({m, n});
}
auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
// A flag recording whether ddout has already been written: while it is false,
// MatMul is called with beta = 0 so ddout gets overwritten; once it is true,
// beta = 1 so the next result is accumulated into ddout.
bool ddout_flag = false;
if (ddx) {
auto ddx_mat =
ddx->dims().size() > 2
? paddle::framework::ReshapeToMatrix(*ddx, x_num_col_dims)
: static_cast<const DenseTensor&>(*ddx);
// dy = ddx' * dout. dy : K x M, ddx' : K x M, dout : M x N
if (dy) {
dy->set_lod(y.lod());
// allocate and reshape dy
dev_ctx.template Alloc<T>(dy);
DenseTensor dy_mat =
dy->dims().size() > 2
? paddle::framework::ReshapeToMatrix(*dy, y_num_col_dims)
: *dy;
blas.MatMul(ddx_mat, true, dout_mat, false, &dy_mat);
}
// ddout1 = ddx * y. ddx : M x K, y : K x N, ddout1 : M x N
if (ddout) {
blas.MatMul(ddx_mat,
false,
y_mat,
false,
static_cast<T>(1.0),
&ddout_mat,
static_cast<T>(ddout_flag));
ddout_flag = true;
}
}
if (ddy) {
auto ddy_mat =
ddy->dims().size() > 2
? paddle::framework::ReshapeToMatrix(*ddy, y_num_col_dims)
: static_cast<const DenseTensor&>(*ddy);
// dx = dout * ddy'. dout : M x N, ddy' : N x K, dx : M x K
if (dx) {
dx->set_lod(x.lod());
// allocate and reshape dx
dev_ctx.template Alloc<T>(dx);
DenseTensor dx_mat =
dx->dims().size() > 2
? paddle::framework::ReshapeToMatrix(*dx, x_num_col_dims)
: *dx;
blas.MatMul(dout_mat, false, ddy_mat, true, &dx_mat);
}
// ddout2 = x * ddy. x : M x K, ddy : K x N, ddout2 : M x N
if (ddout) {
blas.MatMul(x_mat,
false,
ddy_mat,
false,
static_cast<T>(1.0),
&ddout_mat,
static_cast<T>(ddout_flag));
}
}
}
} // namespace phi } // namespace phi
...@@ -506,4 +506,34 @@ void MatmulKernel(const Context& dev_ctx, ...@@ -506,4 +506,34 @@ void MatmulKernel(const Context& dev_ctx,
MatMulFunction<Context, T>(dev_ctx, x, y, out, transpose_x, transpose_y); MatMulFunction<Context, T>(dev_ctx, x, y, out, transpose_x, transpose_y);
} }
template <typename T, typename Context>
void MatmulWithFlattenKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
int x_num_col_dims,
int y_num_col_dims,
DenseTensor* out) {
const DenseTensor x_matrix =
x.dims().size() > 2
? paddle::framework::ReshapeToMatrix(x, x_num_col_dims)
: x;
const DenseTensor y_matrix =
y.dims().size() > 2
? paddle::framework::ReshapeToMatrix(y, y_num_col_dims)
: y;
dev_ctx.template Alloc<T>(out);
auto z_dim = out->dims();
if (z_dim.size() != 2) {
out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
}
auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
blas.MatMul(x_matrix, y_matrix, out);
if (z_dim.size() != 2) {
out->Resize(z_dim);
}
}
} // namespace phi } // namespace phi
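For readers less familiar with the legacy `mul` semantics that matmul_with_flatten preserves: the kernel first collapses the leading x_num_col_dims dimensions of x into the row dimension of a 2-D matrix (and likewise the leading y_num_col_dims dimensions of y), multiplies the two matrices, and finally restores the original output dims. A rough NumPy sketch of that behaviour follows; it is illustrative only and is not the kernel itself.

import numpy as np

def matmul_with_flatten_ref(x, y, x_num_col_dims=1, y_num_col_dims=1):
    # ReshapeToMatrix: leading dims become rows, trailing dims become columns.
    x_mat = x.reshape(int(np.prod(x.shape[:x_num_col_dims])), -1)
    y_mat = y.reshape(int(np.prod(y.shape[:y_num_col_dims])), -1)
    # The real kernel resizes the 2-D product back to the original out dims.
    return x_mat @ y_mat

x = np.random.rand(2, 3, 4)   # flattened to 2 x 12 for x_num_col_dims == 1
y = np.random.rand(12, 5)
print(matmul_with_flatten_ref(x, y).shape)  # (2, 5)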
...@@ -60,4 +60,28 @@ void MatmulTripleGradKernel(const Context& dev_ctx, ...@@ -60,4 +60,28 @@ void MatmulTripleGradKernel(const Context& dev_ctx,
DenseTensor* out_d_ddx, DenseTensor* out_d_ddx,
DenseTensor* out_d_ddy); DenseTensor* out_d_ddy);
template <typename T, typename Context>
void MatmulWithFlattenGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
const DenseTensor& out_grad,
int x_num_col_dims,
int y_num_col_dims,
DenseTensor* x_grad,
DenseTensor* y_grad);
template <typename T, typename Context>
void MatmulWithFlattenDoubleGradKernel(
const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
const DenseTensor& out_grad,
paddle::optional<const DenseTensor&> x_grad_grad,
paddle::optional<const DenseTensor&> y_grad_grad,
int x_num_col_dims,
int y_num_col_dims,
DenseTensor* x_grad,
DenseTensor* y_grad,
DenseTensor* out_grad_grad);
} // namespace phi } // namespace phi
...@@ -29,6 +29,16 @@ void MatmulKernel(const Context& dev_ctx, ...@@ -29,6 +29,16 @@ void MatmulKernel(const Context& dev_ctx,
bool transpose_y, bool transpose_y,
DenseTensor* out); DenseTensor* out);
// In order to be compatible with `mul` op in fluid,
// it is no longer used in 2.x API
template <typename T, typename Context>
void MatmulWithFlattenKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
int x_num_col_dims,
int y_num_col_dims,
DenseTensor* out);
template <typename T, typename Context> template <typename T, typename Context>
DenseTensor Matmul(const Context& dev_ctx, DenseTensor Matmul(const Context& dev_ctx,
const DenseTensor& x, const DenseTensor& x,
......
set(SELECTED_ROWS_KERNEL_DEPS dense_tensor selected_rows sparse_csr_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils math_function custom_kernel)
register_kernels(DEPS ${SELECTED_ROWS_KERNEL_DEPS} SUB_DIR "selected_rows_kernel")
...@@ -19,7 +19,7 @@ ...@@ -19,7 +19,7 @@
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#endif #endif
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/selected_rows/isfinite_kernel_impl.h" #include "paddle/phi/kernels/selected_rows/impl/isfinite_kernel_impl.h"
namespace phi { namespace phi {
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
KernelSignature MulGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
return KernelSignature("matmul_with_flatten_grad",
{"X", "Y", GradVarName("Out")},
{"x_num_col_dims", "y_num_col_dims"},
{GradVarName("X"), GradVarName("Y")});
}
KernelSignature MulDoubleGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature("matmul_with_flatten_double_grad",
{"X", "Y", "DOut", "DDX", "DDY"},
{"x_num_col_dims", "y_num_col_dims"},
{"DX", "DY", "DDOut"});
}
} // namespace phi
PD_REGISTER_BASE_KERNEL_NAME(mul, matmul_with_flatten);
PD_REGISTER_BASE_KERNEL_NAME(mul_grad, matmul_with_flatten_grad);
PD_REGISTER_BASE_KERNEL_NAME(mul_grad_grad, matmul_with_flatten_double_grad);
PD_REGISTER_ARG_MAPPING_FN(mul_grad, phi::MulGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(mul_grad_grad, phi::MulDoubleGradOpArgumentMapping);
...@@ -76,7 +76,7 @@ if not defined INFERENCE_DEMO_INSTALL_DIR set INFERENCE_DEMO_INSTALL_DIR=%cache_ ...@@ -76,7 +76,7 @@ if not defined INFERENCE_DEMO_INSTALL_DIR set INFERENCE_DEMO_INSTALL_DIR=%cache_
if not defined LOG_LEVEL set LOG_LEVEL=normal if not defined LOG_LEVEL set LOG_LEVEL=normal
if not defined PRECISION_TEST set PRECISION_TEST=OFF if not defined PRECISION_TEST set PRECISION_TEST=OFF
if not defined NIGHTLY_MODE set PRECISION_TEST=OFF if not defined NIGHTLY_MODE set PRECISION_TEST=OFF
if not defined retry_times set retry_times=3 if not defined retry_times set retry_times=1
if not defined PYTHON_ROOT set PYTHON_ROOT=C:\Python37 if not defined PYTHON_ROOT set PYTHON_ROOT=C:\Python37
if not defined BUILD_DIR set BUILD_DIR=build if not defined BUILD_DIR set BUILD_DIR=build
set task_name=%1 set task_name=%1
...@@ -234,7 +234,6 @@ set WITH_MKL=OFF ...@@ -234,7 +234,6 @@ set WITH_MKL=OFF
set WITH_GPU=OFF set WITH_GPU=OFF
set WITH_AVX=OFF set WITH_AVX=OFF
set MSVC_STATIC_CRT=ON set MSVC_STATIC_CRT=ON
set retry_times=1
set ON_INFER=OFF set ON_INFER=OFF
call :cmake || goto cmake_error call :cmake || goto cmake_error
...@@ -267,7 +266,6 @@ rem ------Build windows avx whl package------ ...@@ -267,7 +266,6 @@ rem ------Build windows avx whl package------
set WITH_AVX=ON set WITH_AVX=ON
set ON_INFER=OFF set ON_INFER=OFF
set CUDA_ARCH_NAME=All set CUDA_ARCH_NAME=All
set retry_times=4
call :cmake || goto cmake_error call :cmake || goto cmake_error
call :build || goto build_error call :build || goto build_error
...@@ -279,7 +277,6 @@ rem ------Build windows no-avx whl package------ ...@@ -279,7 +277,6 @@ rem ------Build windows no-avx whl package------
set WITH_AVX=OFF set WITH_AVX=OFF
set ON_INFER=OFF set ON_INFER=OFF
set CUDA_ARCH_NAME=All set CUDA_ARCH_NAME=All
set retry_times=4
call :cmake || goto cmake_error call :cmake || goto cmake_error
call :build || goto build_error call :build || goto build_error
......
...@@ -209,6 +209,9 @@ function cmake_base() { ...@@ -209,6 +209,9 @@ function cmake_base() {
-DWITH_MKL=${WITH_MKL:-ON} -DWITH_MKL=${WITH_MKL:-ON}
-DWITH_AVX=${WITH_AVX:-OFF} -DWITH_AVX=${WITH_AVX:-OFF}
-DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All} -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All}
-DNEW_RELEASE_PYPI=${NEW_RELEASE_PYPI:-OFF}
-DNEW_RELEASE_ALL=${NEW_RELEASE_ALL:-OFF}
-DNEW_RELEASE_JIT=${NEW_RELEASE_JIT:-OFF}
-DWITH_PYTHON=${WITH_PYTHON:-ON} -DWITH_PYTHON=${WITH_PYTHON:-ON}
-DCUDNN_ROOT=/usr/ -DCUDNN_ROOT=/usr/
-DWITH_TESTING=${WITH_TESTING:-ON} -DWITH_TESTING=${WITH_TESTING:-ON}
...@@ -262,6 +265,9 @@ EOF ...@@ -262,6 +265,9 @@ EOF
-DWITH_AVX=${WITH_AVX:-OFF} \ -DWITH_AVX=${WITH_AVX:-OFF} \
-DNOAVX_CORE_FILE=${NOAVX_CORE_FILE:-""} \ -DNOAVX_CORE_FILE=${NOAVX_CORE_FILE:-""} \
-DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All} \ -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All} \
-DNEW_RELEASE_PYPI=${NEW_RELEASE_PYPI:-OFF} \
-DNEW_RELEASE_ALL=${NEW_RELEASE_ALL:-OFF} \
-DNEW_RELEASE_JIT=${NEW_RELEASE_JIT:-OFF} \
-DWITH_PYTHON=${WITH_PYTHON:-ON} \ -DWITH_PYTHON=${WITH_PYTHON:-ON} \
-DCUDNN_ROOT=/usr/ \ -DCUDNN_ROOT=/usr/ \
-DWITH_TESTING=${WITH_TESTING:-ON} \ -DWITH_TESTING=${WITH_TESTING:-ON} \
......
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License
from .base_cost import OP_COST_FACTORY
from .base_cost import Cost
from .comm_op_cost import AllreduceSumCost
from .comp_op_cost import MatmulV2OpCost
from .tensor_cost import TensorCost
from .estimate_cost import CostEstimator
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License
from collections import OrderedDict
import paddle
COMM_OP_TYPE = [
"send_v2", "recv_v2", "c_broadcast", "c_allgather", "c_allreduce_sum"
]
NON_COMP_TYPE = ["while"] + COMM_OP_TYPE
OP_COST_FACTORY = {}
def _parse_op_to_desc(op, dist_context=None):
desc = {}
desc["op"] = op.type
vars = op.block.vars
input_desc = OrderedDict()
for input_name in op.input_names:
var_name_list = op.input(input_name)
var_desc = []
for var_name in var_name_list:
var = vars[var_name]
shape = None
if dist_context is not None:
dist_tensor = dist_context.get_dist_tensor_for_program(var)
shape = dist_tensor.local_sizes()
else:
shape = var.shape
assert shape is not None
var_desc.append((var.dtype, shape))
input_desc[input_name] = var_desc
desc["inputs"] = input_desc
output_desc = OrderedDict()
for out_name in op.output_names:
var_name_list = op.output(out_name)
var_desc = []
for var_name in var_name_list:
var = vars[var_name]
shape = None
if dist_context is not None:
dist_tensor = dist_context.get_dist_tensor_for_program(var)
shape = dist_tensor.local_sizes()
else:
shape = var.shape
assert shape is not None
var_desc.append((var.dtype, shape))
output_desc[out_name] = var_desc
desc["outputs"] = output_desc
attr_desc = op.all_attrs()
desc["attrs"] = attr_desc
return desc
def parse_to_desc(op=None, dist_op=None, dist_context=None):
desc = None
if op is None and dist_op is not None and dist_context is not None:
desc = _parse_op_to_desc(
op=dist_op.serial_op, dist_context=dist_context)
elif op is not None and dist_op is None and dist_context is None:
desc = _parse_op_to_desc(op)
return desc
def parse_desc_to_str(desc):
def _parse_dtype(dtype):
dtype_str = ""
if dtype == paddle.float32:
dtype_str = "float32"
elif dtype == paddle.float16:
dtype_str = "float16"
elif dtype == paddle.int32:
dtype_str = "int32"
elif dtype == paddle.int64:
dtype_str = "int64"
elif dtype == paddle.uint8:
dtype_str = "uint8"
else:
raise TypeError("Unsupported dtype {}".format(dtype))
return dtype_str
assert isinstance(desc, dict)
desc_str_list = []
desc_str = None
dtype_str_list = []
dims_list = []
shape_list = []
desc_str_list.append(desc["op"])
inputs = desc["inputs"]
for key, item in inputs.items():
for dtype, shape in item:
dtype_str_list.append(_parse_dtype(dtype))
shape_list += list(shape)
dims = len(shape)
dims_list.append(dims)
dtype_str = "*".join(dtype_str_list)
dims_list = [str(item) for item in dims_list]
dims_str = "*".join(dims_list)
shape_list = [str(item) for item in shape_list]
shape_str = "[" + ",".join(shape_list) + "]"
desc_str_list += [dtype_str, dims_str, shape_str]
desc_str = "_".join(desc_str_list)
return desc_str
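As a quick illustration of the string layout produced by parse_desc_to_str (a sketch only; the desc below is hand-built following the (dtype, shape) convention used by _parse_op_to_desc and relies on this module's paddle import):

sample_desc = {
    "op": "matmul_v2",
    "inputs": {
        "X": [(paddle.float32, (20, 20))],
        "Y": [(paddle.float32, (20, 20))],
    },
}
# Expected: "matmul_v2_float32*float32_2*2_[20,20,20,20]"
print(parse_desc_to_str(sample_desc))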
class CommContext:
_instance = None
_has_instance = False
def __init__(self, cluster):
if CommContext._has_instance:
return
self.cluster = cluster
self._alpha_base_ring = 8.4
self._alpha_base_tree = 0
self._alpha_inter = None
self._alpha_intra = None
self._beta = {}
def __new__(cls, *args, **kwargs):
if cls._instance is None:
cls._instance = super().__new__(cls)
cls._has_instance = True
return cls._instance
@property
def alpha_inter(self):
if self._alpha_inter is None:
if self.cluster.alpha.inter == "NVL":
self._alpha_inter = 3.4
elif self.cluster.alpha.inter == "PHB":
self._alpha_inter = 5.7
return self._alpha_inter
@property
def alpha_intra(self):
if self._alpha_intra is None:
if self.cluster.alpha.intra == "NVL":
self._alpha_intra = 28
elif self.cluster.alpha.intra == "PHB":
self._alpha_intra = 28
return self._alpha_intra
@property
def alpha_base_ring(self):
return self._alpha_base_ring
@property
def alpha_base_tree(self):
return self._alpha_base_tree
def get_beta(self, ranks):
key = ','.join(map(str, sorted(ranks)))
max_beta = None
if key in self._beta:
max_beta = self._beta[key]
else:
for i in range(len(ranks)):
for j in range(i + 1, len(ranks)):
if max_beta is None:
max_beta = self.cluster.get_beta(ranks[i], ranks[j])
else:
beta = self.cluster.get_beta(ranks[i], ranks[j])
if beta > max_beta:
max_beta = beta
self._beta[key] = max_beta
return max_beta
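In other words, get_beta caches, per sorted rank set, the largest point-to-point beta the cluster reports for any pair of the given ranks. A minimal sketch of the intended behaviour, assuming a hypothetical cluster stub:

class _FakeCluster:
    # hypothetical stand-in; beta grows with rank distance
    def get_beta(self, a, b):
        return abs(a - b)

ctx = CommContext(_FakeCluster())
print(ctx.get_beta([0, 1, 3]))  # 3, the max over the pairs (0, 1), (0, 3), (1, 3)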
class Cost:
def __init__(self, time=0, memory=0, flops=0):
self.time = time
self.memory = memory
self.flops = flops
def _check_time(self, val):
assert val >= 0, "Time must be greater than or equal to 0."
def _check_memory(self, val):
assert isinstance(
val, int) and val >= 0, "Memory must be int and greater than 0."
def _check_flops(self, val):
assert isinstance(
val, int) and val >= 0, "FLOPs must be int and greater than 0."
@property
def time(self):
return self._time
@time.setter
def time(self, val):
self._check_time(val)
self._time = val
@property
def memory(self):
return self._memory
@memory.setter
def memory(self, val):
self._check_memory(val)
self._memory = val
@property
def flops(self):
return self._flops
@flops.setter
def flops(self, val):
self._check_flops(val)
self._flops = val
def __add__(self, rhs):
assert isinstance(rhs, Cost)
time = self.time + rhs.time
memory = self.memory + rhs.memory
flops = self.flops + rhs.flops
assert (time >= 0 and memory >= 0 and flops >= 0)
return Cost(time, memory, flops)
def __sub__(self, rhs):
assert isinstance(rhs, Cost)
time = self.time - rhs.time
memory = self.memory - rhs.memory
flops = self.flops - rhs.flops
assert (time >= 0 and memory >= 0 and flops >= 0)
return Cost(time, memory, flops)
class OpCost:
def __init__(self, op=None, op_desc=None):
assert (op is not None and op_desc is None) or (op is None and
op_desc is not None)
self._op = op
self._op_desc = op_desc
self._cost = self.calc_cost()
@property
def op(self):
return self._op
@property
def op_desc(self):
return self._op_desc
@property
def cost(self):
return self._cost
def calc_time(self):
return 0
def calc_memory(self):
return 0
def calc_flops(self):
return 0
def calc_cost(self):
time = self.calc_time()
memory = self.calc_memory()
flops = self.calc_flops()
cost = Cost(time, memory, flops)
return cost
class CommOpCost(OpCost):
OP_TYPE = "COMM"
def __init__(self, op=None, op_desc=None, comm_context=None):
super(CommOpCost, self).__init__(op=op, op_desc=op_desc)
self._check_comm_op_type()
self._comm_context = comm_context
@property
def comm_context(self):
return self._comm_context
@classmethod
def _check_comm_op_type(cls):
if cls.OP_TYPE != "COMM":
if cls.OP_TYPE not in COMM_OP_TYPE:
raise TypeError("Please Check op type in {}, but got {}.".
format(COMM_OP_TYPE, cls.OP_TYPE))
class CompOpCost(OpCost):
OP_TYPE = "COMP"
def __init__(self, op=None, op_desc=None, cluster=None):
super(CompOpCost, self).__init__(op=op, op_desc=op_desc)
self._check_comp_op_type()
self.cluster = cluster
@classmethod
def _check_comp_op_type(cls):
if cls.OP_TYPE != "COMP":
if cls.OP_TYPE in NON_COMP_TYPE:
raise TypeError("Please Check op type not in {}, but got {}.".
format(NON_COMP_TYPE, cls.OP_TYPE))
def register_op_cost(cls):
op_type = cls.OP_TYPE
def register(op_type):
OP_COST_FACTORY[op_type] = cls
register(op_type)
# return the class itself so that decorated classes stay importable
return cls
def calc_time_from_model(op=None, desc=None, cluster=None, comm_context=None):
op_type = op.type if op is not None else desc["op"]
if op_type in COMM_OP_TYPE:
op_cost = OP_COST_FACTORY[op_type](op=op,
op_desc=desc,
comm_context=comm_context)
elif op_type not in NON_COMP_TYPE:
op_cost = OP_COST_FACTORY[op_type](op=op, op_desc=desc, cluster=cluster)
time = op_cost.calc_time()
return time
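A minimal usage sketch of the registry and factory defined above. SendV2OpCost is a hypothetical subclass written here for illustration; the real subclasses, AllreduceSumCost and MatmulV2OpCost, live in the sibling modules below and are equally zero-cost placeholders for now.

@register_op_cost
class SendV2OpCost(CommOpCost):
    OP_TYPE = "send_v2"  # already listed in COMM_OP_TYPE

    def calc_time(self):
        # placeholder, like the other cost classes in this package
        return 0

desc = {"op": "send_v2", "inputs": {"X": [(paddle.float32, [100, 200])]}}
op_cost = OP_COST_FACTORY["send_v2"](op_desc=desc)
print(op_cost.cost.time, op_cost.cost.memory, op_cost.cost.flops)  # 0 0 0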
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License
from .base_cost import register_op_cost, CommOpCost, OP_COST_FACTORY
@register_op_cost
class AllreduceSumCost(CommOpCost):
OP_TYPE = "c_allreduce_sum"
def __init__(self, op=None, op_desc=None, comm_context=None):
super(OP_COST_FACTORY["c_allreduce_sum"], self).__init__(
op=op, op_desc=op_desc, comm_context=comm_context)
def calc_time(self):
# NOTE: The actual formula will be filled in the future.
return 0
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License
from .base_cost import Cost, register_op_cost, CompOpCost, OP_COST_FACTORY
@register_op_cost
class MatmulV2OpCost(CompOpCost):
OP_TYPE = "matmul_v2"
def __init__(self, op=None, op_desc=None, cluster=None):
super(OP_COST_FACTORY["matmul_v2"], self).__init__(
op=op, op_desc=op_desc, cluster=cluster)
# For a concrete COMP OP, the calc_time and calc_flops functions need to be overridden
def calc_flops(self):
# NOTE: The actual formula will be filled in the future
return 0
def calc_time(self):
# NOTE: The actual formula will be filled in the future
return 0
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License
class CostEstimator:
def __init__(self,
program,
cluster=None,
dist_context=None,
mode="modeling"):
self._program = program
self._cluster = cluster
self._dist_context = dist_context
self._check_mode(mode)
self._mode = mode
self._global_cost = None
self._local_cost = {}
@property
def program(self):
return self._program
@property
def dist_context(self):
return self._dist_context
@property
def cluster(self):
return self._cluster
@property
def mode(self):
return self._mode
@property
def global_cost(self):
return self._global_cost
@property
def local_cost(self):
return self._local_cost
def get_op_cost(self):
return 0
def get_tensor_cost(self):
return 0
def get_global_cost(self):
return 0
def get_local_cost(self, rank=None):
return 0
def _check_mode(self, mode):
if mode not in ["modeling", "profiling"]:
raise ValueError(
"Just support modeling and profiling, but got {}".format(mode))
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License
from functools import reduce
import paddle
from paddle.fluid.framework import Variable
from paddle.distributed.auto_parallel.dist_tensor import DistributedTensor
from .base_cost import Cost
class TensorCost:
def __init__(self, tensor=None, dist_tensor=None, shape=None, dtype=None):
self._check_args(tensor, dist_tensor, shape, dtype)
self._tensor = tensor
self._dist_tensor = dist_tensor
self._shape = shape
self._dtype = dtype
self._cost = self.calc_cost()
@property
def tensor(self):
return self._tensor
@property
def dist_tensor(self):
return self._dist_tensor
@property
def shape(self):
return self._shape
@property
def dtype(self):
return self._dtype
def _check_args(self, tensor, dist_tensor, shape, dtype):
if tensor is not None:
assert (shape is None and dist_tensor is None and dtype is None)
if not isinstance(tensor, Variable):
raise TypeError(
"Please check tensor type is Variable, but got {}".format(
type(tensor)))
elif dist_tensor is not None:
assert (tensor is None and shape is None)
if not isinstance(dist_tensor, DistributedTensor):
raise TypeError(
"Please check dist_tensor type is DistributedTensor, but got {}".
format(type(dist_tensor)))
elif shape is not None:
assert (tensor is None and dist_tensor is None and
dtype is not None)
if not isinstance(shape, (list, set)):
raise TypeError(
"Please check shape type is list or set, but got {}".format(
type(shape)))
elif dtype is not None:
assert (tensor is None and dist_tensor is None and
shape is not None)
@property
def cost(self):
return self._cost
def calc_cost(self):
dtype = None
shape = None
if self.dist_tensor:
shape = self.dist_tensor.local_sizes()
dtype = self.dist_tensor.serial_tensor.dtype
elif self.tensor:
shape = self.tensor.shape
dtype = self.tensor.dtype
elif self.shape and self.dtype:
shape = self.shape
dtype = self.dtype
total_count = reduce(lambda x, y: x * y, shape)
if dtype == paddle.float32 or dtype == paddle.int32:
dtype_factor = 4
elif dtype == paddle.int64:
dtype_factor = 8
elif dtype == paddle.uint8:
dtype_factor = 1
else:
dtype_factor = 2
memory = total_count * dtype_factor
assert memory >= 0
cost = Cost(memory=memory)
return cost
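For example, the 1600-byte expectation in test_new_cost_model.py further below follows directly from this formula (a sketch, reusing the test's float32 [20, 20] tensor):

paddle.enable_static()
x = paddle.static.data(name="x", shape=[20, 20], dtype='float32')
# 20 * 20 float32 elements at 4 bytes each -> 1600 bytes
print(TensorCost(tensor=x).cost.memory)  # 1600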
...@@ -242,7 +242,8 @@ see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/tra ...@@ -242,7 +242,8 @@ see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/tra
elastic_group.add_argument( elastic_group.add_argument(
"--force", type=bool, default=False, help="update np force") "--force", type=bool, default=False, help="update np force")
return parser.parse_args() known_args, _ = parser.parse_known_args()
return known_args
def get_cluster_from_args(args, device_mode, devices_per_proc): def get_cluster_from_args(args, device_mode, devices_per_proc):
......
...@@ -25,12 +25,13 @@ class Context(object): ...@@ -25,12 +25,13 @@ class Context(object):
def __init__(self, enable_plugin=True): def __init__(self, enable_plugin=True):
self.args, self.unknown_args = parse_args() self.args, self.unknown_args = parse_args()
self.envs = fetch_envs() self.envs = fetch_envs()
self.logger = self.get_logger()
self.set_env_in_args()
self.node = Node() self.node = Node()
self.status = Status() self.status = Status()
self.set_env_in_args() self.logger = self.get_logger()
# design for event queue, later # design for event queue, later
self.events = [] self.events = []
......
...@@ -57,7 +57,7 @@ class Device(object): ...@@ -57,7 +57,7 @@ class Device(object):
else: else:
self._labels = [] self._labels = []
def get_selected_flag_key(self): def get_selected_device_key(self):
if self._dtype == DeviceType.CPU: if self._dtype == DeviceType.CPU:
return 'FLAGS_selected_cpus' return 'FLAGS_selected_cpus'
if self._dtype == DeviceType.GPU: if self._dtype == DeviceType.GPU:
...@@ -70,19 +70,15 @@ class Device(object): ...@@ -70,19 +70,15 @@ class Device(object):
return 'FLAGS_selected_mlus' return 'FLAGS_selected_mlus'
return 'FLAGS_selected_devices' return 'FLAGS_selected_devices'
def get_selected_flag_label(self, idx): def get_selected_devices(self, devices=''):
if idx < len(self._labels): '''
return self._labels[idx] return the device label/id relative to the visible devices
'''
if not devices:
return [str(x) for x in range(0, len(self._labels))]
else: else:
return '0' devs = [x.strip() for x in devices.split(',')]
return [str(self._labels.index(d)) for d in devs]
def selected_flags(self, idx=None):
if idx is None:
return {self.get_selected_flag_key(): ','.join(self._labels)}
else:
return {
self.get_selected_flag_key(): self.get_selected_flag_label(idx)
}
@classmethod @classmethod
def parse_device(self): def parse_device(self):
......
...@@ -75,6 +75,9 @@ class CollectiveController(Controller): ...@@ -75,6 +75,9 @@ class CollectiveController(Controller):
job_endpoints = [i['endpoints'] for i in peer_list] job_endpoints = [i['endpoints'] for i in peer_list]
self.pod.reset() self.pod.reset()
selected_dev_key = self.ctx.node.device.get_selected_device_key()
selected_dev_list = self.ctx.node.device.get_selected_devices(
self.ctx.args.devices)
for i in range(self.pod.replicas): for i in range(self.pod.replicas):
e = { e = {
"PADDLE_MASTER": collective_master, "PADDLE_MASTER": collective_master,
...@@ -90,9 +93,9 @@ class CollectiveController(Controller): ...@@ -90,9 +93,9 @@ class CollectiveController(Controller):
"PADDLE_RANK_IN_NODE": str(i), "PADDLE_RANK_IN_NODE": str(i),
} }
if self.pod.replicas == 1: if self.pod.replicas == 1:
e.update(self.ctx.node.device.selected_flags()) e.update({selected_dev_key: selected_dev_list})
else: else:
e.update(self.ctx.node.device.selected_flags(i)) e.update({selected_dev_key: selected_dev_list[i]})
self.add_container(envs=e, log_tag=i) self.add_container(envs=e, log_tag=i)
return True return True
......
...@@ -210,6 +210,8 @@ class Controller(ControllerBase): ...@@ -210,6 +210,8 @@ class Controller(ControllerBase):
if self.ctx.args.nproc_per_node: if self.ctx.args.nproc_per_node:
return int(self.ctx.args.nproc_per_node) return int(self.ctx.args.nproc_per_node)
elif self.ctx.args.devices:
return len(self.ctx.args.devices.split(','))
else: else:
return self.ctx.node.device.count return self.ctx.node.device.count
......
...@@ -29,8 +29,9 @@ def process_args(ctx): ...@@ -29,8 +29,9 @@ def process_args(ctx):
#argdev = ctx.args.gpus or ctx.args.xpus or ctx.args.npus #argdev = ctx.args.gpus or ctx.args.xpus or ctx.args.npus
argdev = ctx.args.devices argdev = ctx.args.devices
if argdev: if argdev:
ctx.node.device.labels = argdev.split(',') for d in argdev.split(','):
ctx.logger.debug('Device reset by args {}'.format(argdev)) assert d in ctx.node.device.labels, 'Device not found {}'.format(
argdev)
def collective_compatible(ctx): def collective_compatible(ctx):
......
...@@ -22,6 +22,10 @@ from paddle.fluid.framework import program_guard, device_guard ...@@ -22,6 +22,10 @@ from paddle.fluid.framework import program_guard, device_guard
from paddle.fluid import unique_name, layers from paddle.fluid import unique_name, layers
from paddle.fluid.clip import append_gradient_clip_ops from paddle.fluid.clip import append_gradient_clip_ops
from .pass_base import PassBase, PassType, register_pass from .pass_base import PassBase, PassType, register_pass
from paddle.distributed.auto_parallel.utils import set_var_dist_attr
from paddle.distributed.auto_parallel.utils import naive_set_dist_op_attr_for_program_by_mesh_and_mapping
from paddle.distributed.auto_parallel.process_group import get_world_process_group
world_process_group = get_world_process_group()
def _is_the_backward_op(op): def _is_the_backward_op(op):
...@@ -68,15 +72,11 @@ def _remove_and_get_optimizer_op(main_program, dist_context): ...@@ -68,15 +72,11 @@ def _remove_and_get_optimizer_op(main_program, dist_context):
def _remove_op_role_var(param, grad): def _remove_op_role_var(param, grad):
op_maker = core.op_proto_and_checker_maker op_maker = core.op_proto_and_checker_maker
op = grad.op op = grad.op
assert _is_the_backward_op(op), \
'grad.op={} is not the backward op which produces the grad={}' \
.format(op, grad.name)
if op.has_attr(op_maker.kOpRoleVarAttrName()): if op.has_attr(op_maker.kOpRoleVarAttrName()):
op._remove_attr(op_maker.kOpRoleVarAttrName()) op._remove_attr(op_maker.kOpRoleVarAttrName())
def _get_gm_cond_var(main_program, k_steps): def _get_gm_cond_var(main_program, k_steps, dist_context):
main_block = main_program.global_block() main_block = main_program.global_block()
# Add const var # Add const var
k_step_var = layers.create_global_var( k_step_var = layers.create_global_var(
...@@ -86,6 +86,7 @@ def _get_gm_cond_var(main_program, k_steps): ...@@ -86,6 +86,7 @@ def _get_gm_cond_var(main_program, k_steps):
dtype='int32', dtype='int32',
persistable=True, persistable=True,
force_cpu=True) force_cpu=True)
set_var_dist_attr(dist_context, k_step_var, [-1], world_process_group.ranks)
zero_var = layers.create_global_var( zero_var = layers.create_global_var(
name="gradient_merge_zero", name="gradient_merge_zero",
...@@ -94,6 +95,7 @@ def _get_gm_cond_var(main_program, k_steps): ...@@ -94,6 +95,7 @@ def _get_gm_cond_var(main_program, k_steps):
dtype='int32', dtype='int32',
persistable=True, persistable=True,
force_cpu=True) force_cpu=True)
set_var_dist_attr(dist_context, zero_var, [-1], world_process_group.ranks)
# Add step var & cond var # Add step var & cond var
step_var = layers.create_global_var( step_var = layers.create_global_var(
...@@ -103,6 +105,7 @@ def _get_gm_cond_var(main_program, k_steps): ...@@ -103,6 +105,7 @@ def _get_gm_cond_var(main_program, k_steps):
dtype='int32', dtype='int32',
persistable=True, persistable=True,
force_cpu=True) force_cpu=True)
set_var_dist_attr(dist_context, step_var, [-1], world_process_group.ranks)
cond_var = layers.create_global_var( cond_var = layers.create_global_var(
name="gradient_merge_cond", name="gradient_merge_cond",
...@@ -111,24 +114,29 @@ def _get_gm_cond_var(main_program, k_steps): ...@@ -111,24 +114,29 @@ def _get_gm_cond_var(main_program, k_steps):
dtype='bool', dtype='bool',
persistable=False, persistable=False,
force_cpu=True) force_cpu=True)
set_var_dist_attr(dist_context, cond_var, [-1], world_process_group.ranks)
with device_guard("cpu"): with device_guard("cpu"):
# step_var = (step_var + 1) % k_step # step_var = (step_var + 1) % k_step
layers.increment(x=step_var, value=1.0, in_place=True) layers.increment(x=step_var, value=1.0, in_place=True)
main_block.append_op( elementwise_mod_op = main_block.append_op(
type='elementwise_mod', type='elementwise_mod',
inputs={'X': step_var, inputs={'X': step_var,
'Y': k_step_var}, 'Y': k_step_var},
outputs={'Out': step_var}, outputs={'Out': step_var},
attrs={'axis': -1, attrs={'axis': -1,
'use_mkldnn': False}) 'use_mkldnn': False})
naive_set_dist_op_attr_for_program_by_mesh_and_mapping(
elementwise_mod_op, world_process_group.ranks, [-1], dist_context)
# cond_var = (step_var == 0) # cond_var = (step_var == 0)
main_block.append_op( equal_op = main_block.append_op(
type='equal', type='equal',
inputs={'X': step_var, inputs={'X': step_var,
'Y': zero_var}, 'Y': zero_var},
outputs={'Out': cond_var}) outputs={'Out': cond_var})
naive_set_dist_op_attr_for_program_by_mesh_and_mapping(
equal_op, world_process_group.ranks, [-1], dist_context)
return cond_var return cond_var
...@@ -137,7 +145,8 @@ def _append_gradient_merge_backward_op( ...@@ -137,7 +145,8 @@ def _append_gradient_merge_backward_op(
main_program, main_program,
startup_program, startup_program,
params_grads: List[Tuple[Any, Any]], params_grads: List[Tuple[Any, Any]],
cond_var_name: str) -> Tuple[List[Tuple[Any, Any]], Dict[str, Any]]: cond_var_name: str,
dist_context) -> Tuple[List[Tuple[Any, Any]], Dict[str, Any]]:
main_block = main_program.global_block() main_block = main_program.global_block()
startup_block = startup_program.global_block() startup_block = startup_program.global_block()
...@@ -156,12 +165,19 @@ def _append_gradient_merge_backward_op( ...@@ -156,12 +165,19 @@ def _append_gradient_merge_backward_op(
param_name = param.name param_name = param.name
param_var = main_block.var(param_name) param_var = main_block.var(param_name)
assert (param_var is not None) assert (param_var is not None)
ref_dist_attr = dist_context.get_tensor_dist_attr_for_program(param_var)
assert ref_dist_attr is not None
gradient_merge_var = main_block.create_var( gradient_merge_var = main_block.create_var(
name=param_name + "@GRAD@GradientMerge", name=param_name + "@GRAD@GradientMerge",
shape=param_var.shape, shape=param_var.shape,
dtype=param_var.dtype, dtype=param_var.dtype,
persistable=True) persistable=True)
param_to_gradient_merge[param_name] = gradient_merge_var param_to_gradient_merge[param_name] = gradient_merge_var
ref_process_mesh = ref_dist_attr.process_mesh
ref_dims_mapping = ref_dist_attr.dims_mapping
set_var_dist_attr(dist_context, gradient_merge_var, ref_dims_mapping,
ref_process_mesh)
startup_gradient_merge_var = startup_block.create_var( startup_gradient_merge_var = startup_block.create_var(
name=param_name + "@GRAD@GradientMerge", name=param_name + "@GRAD@GradientMerge",
...@@ -186,6 +202,8 @@ def _append_gradient_merge_backward_op( ...@@ -186,6 +202,8 @@ def _append_gradient_merge_backward_op(
attrs={'axis': -1, attrs={'axis': -1,
'use_mkldnn': False}) 'use_mkldnn': False})
new_params_to_grads.append([param, gradient_merge_var]) new_params_to_grads.append([param, gradient_merge_var])
naive_set_dist_op_attr_for_program_by_mesh_and_mapping(
new_grad_op, ref_process_mesh, ref_dims_mapping, dist_context)
return new_params_to_grads, param_to_gradient_merge return new_params_to_grads, param_to_gradient_merge
...@@ -240,7 +258,7 @@ def _create_cond_block_and_update_optimizer( ...@@ -240,7 +258,7 @@ def _create_cond_block_and_update_optimizer(
new_op_desc.remove_attr(op_maker.kOpRoleVarAttrName()) new_op_desc.remove_attr(op_maker.kOpRoleVarAttrName())
# op's update Grad # op's update Grad
if new_op_desc.input("Grad"): if core.grad_var_suffix() in new_op_desc.input_arg_names():
grad_value = new_op_desc.input("Grad")[0] grad_value = new_op_desc.input("Grad")[0]
# TODO FIXME(xym) support fp16 # TODO FIXME(xym) support fp16
grad_merge_value = grad_value + '@GradientMerge' grad_merge_value = grad_value + '@GradientMerge'
...@@ -265,7 +283,7 @@ def _create_cond_block_and_update_optimizer( ...@@ -265,7 +283,7 @@ def _create_cond_block_and_update_optimizer(
def parse_program(main_program, startup_program, params_grads, k_steps, avg, def parse_program(main_program, startup_program, params_grads, k_steps, avg,
dist_context): dist_context):
# 1 create gradient_merge_cond # 1 create gradient_merge_cond
cond_var = _get_gm_cond_var(main_program, k_steps) cond_var = _get_gm_cond_var(main_program, k_steps, dist_context)
# 2 remove optimizer_op from main_program # 2 remove optimizer_op from main_program
optimize_ops_desc = _remove_and_get_optimizer_op(main_program, dist_context) optimize_ops_desc = _remove_and_get_optimizer_op(main_program, dist_context)
...@@ -275,7 +293,8 @@ def parse_program(main_program, startup_program, params_grads, k_steps, avg, ...@@ -275,7 +293,8 @@ def parse_program(main_program, startup_program, params_grads, k_steps, avg,
# 3 append gradient merge backward op to main_program # 3 append gradient merge backward op to main_program
new_params_to_grads, param_to_gradient_merge = _append_gradient_merge_backward_op( new_params_to_grads, param_to_gradient_merge = _append_gradient_merge_backward_op(
main_program, startup_program, params_grads, cond_var.name) main_program, startup_program, params_grads, cond_var.name,
dist_context)
# 4 create ConditionalBlock and append gradient merge optimizer ops # 4 create ConditionalBlock and append gradient merge optimizer ops
_create_cond_block_and_update_optimizer( _create_cond_block_and_update_optimizer(
......
...@@ -97,7 +97,9 @@ class Communicator(object): ...@@ -97,7 +97,9 @@ class Communicator(object):
recv_ctx, recv_ctx,
proto_txt, proto_txt,
unit64_hosts, unit64_hosts,
scope=global_scope()): scope=None):
if scope is None:
scope = global_scope()
self.communicator_ = core.DistCommunicator(self.mode, proto_txt, self.communicator_ = core.DistCommunicator(self.mode, proto_txt,
unit64_hosts, send_ctx, unit64_hosts, send_ctx,
recv_ctx, scope, self.envs) recv_ctx, scope, self.envs)
...@@ -191,7 +193,9 @@ class Communicator(object): ...@@ -191,7 +193,9 @@ class Communicator(object):
def pull_dense(self, context): def pull_dense(self, context):
self.communicator_.pull_dense(context) self.communicator_.pull_dense(context)
def push_sparse_param(self, var_name, table_id=-1, scope=global_scope()): def push_sparse_param(self, var_name, table_id=-1, scope=None):
if scope is None:
scope = global_scope()
if not self.is_running(): if not self.is_running():
raise ValueError( raise ValueError(
"Communicator should init first. Using fleet.init_worker() before push_sparse_param()" "Communicator should init first. Using fleet.init_worker() before push_sparse_param()"
......
...@@ -105,9 +105,8 @@ def check_type(input, input_name, expected_type, op_name, extra_message=''): ...@@ -105,9 +105,8 @@ def check_type(input, input_name, expected_type, op_name, extra_message=''):
if not isinstance(expected_type, tuple): if not isinstance(expected_type, tuple):
expected_type = (expected_type, ) expected_type = (expected_type, )
expected_type += (core.VarBase, ) expected_type += (core.VarBase, )
# TODO(jiabin): uncomment it when we support declarative mode in eager if core._in_eager_mode():
# if _in_eager_mode(): expected_type += (core.eager.Tensor, )
# expected_type += (core.eager.Tensor, )
elif isinstance(input, core.VarBase): elif isinstance(input, core.VarBase):
raise TypeError( raise TypeError(
"Please use `with fluid.dygraph.guard()` as context or `fluid.enable_dygraph()` to switch to imperative mode firstly. " "Please use `with fluid.dygraph.guard()` as context or `fluid.enable_dygraph()` to switch to imperative mode firstly. "
......
...@@ -17,4 +17,5 @@ if(WITH_DISTRIBUTE AND WITH_GPU) ...@@ -17,4 +17,5 @@ if(WITH_DISTRIBUTE AND WITH_GPU)
py_test_modules(test_tunable_space MODULES test_tunable_space ENVS ${dist_ENVS}) py_test_modules(test_tunable_space MODULES test_tunable_space ENVS ${dist_ENVS})
py_test_modules(test_recorder MODULES test_recorder ENVS ${dist_ENVS}) py_test_modules(test_recorder MODULES test_recorder ENVS ${dist_ENVS})
py_test_modules(test_trial MODULES test_trial ENVS ${dist_ENVS}) py_test_modules(test_trial MODULES test_trial ENVS ${dist_ENVS})
py_test_modules(test_new_cost_model MODULES test_new_cost_model ENVS ${dist_ENVS})
endif() endif()
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import paddle
import paddle.distributed.auto_parallel.cost as cost_model
from paddle.distributed.auto_parallel.cost.base_cost import parse_to_desc
from paddle.distributed.auto_parallel.cost.base_cost import parse_desc_to_str
from paddle.distributed.auto_parallel.cost.base_cost import calc_time_from_model
paddle.enable_static()
def check_cost(cost):
if cost.memory >= 0 and cost.flops >= 0 and cost.time >= 0:
return True
return False
class TestCost(unittest.TestCase):
def test_base_cost(self):
cost = cost_model.Cost(memory=100, flops=200, time=0.5)
self.assertTrue(check_cost(cost))
def test_comp_cost(self):
x = paddle.static.data(name="x", shape=[20, 20], dtype='float32')
y = paddle.static.data(name="y", shape=[20, 20], dtype='float32')
z = paddle.matmul(x, y)
matmul_v2_op = None
ops = paddle.static.default_main_program().global_block().ops
for op in ops:
if op.type == "matmul_v2":
matmul_v2_op = op
break
matmul_v2_cost = cost_model.OP_COST_FACTORY["matmul_v2"](
op=matmul_v2_op)
desc = parse_to_desc(op=matmul_v2_op)
desc_str = parse_desc_to_str(desc)
self.assertIsNotNone(desc_str)
self.assertTrue(check_cost(matmul_v2_cost.cost))
time = calc_time_from_model(op=matmul_v2_op)
self.assertEqual(time, matmul_v2_cost.cost.time)
tensor_cost = cost_model.TensorCost(tensor=x)
# check memory
self.assertEqual(tensor_cost.cost.memory, 1600)
def test_comm_cost(self):
desc = {}
desc["op"] = "c_allreduce_sum"
desc["inputs"] = {"X": [([100, 200], paddle.float32)]}
allreduce_cost = cost_model.OP_COST_FACTORY["c_allreduce_sum"](
op_desc=desc)
self.assertTrue(check_cost(allreduce_cost.cost))
def test_cost_estimator(self):
train_program = paddle.static.Program()
cost_estimator = cost_model.CostEstimator(train_program)
self.assertIsNotNone(cost_estimator)
if __name__ == "__main__":
unittest.main()
...@@ -31,6 +31,7 @@ from paddle.fluid.initializer import NumpyArrayInitializer ...@@ -31,6 +31,7 @@ from paddle.fluid.initializer import NumpyArrayInitializer
from paddle.distributed.passes import new_pass, PassManager, PassContext from paddle.distributed.passes import new_pass, PassManager, PassContext
import paddle.distributed.fleet as fleet import paddle.distributed.fleet as fleet
from dist_pass_test_base import DistPassTestBase from dist_pass_test_base import DistPassTestBase
from paddle.distributed.auto_parallel.dist_context import DistributedContext
logging.getLogger().setLevel(logging.INFO) logging.getLogger().setLevel(logging.INFO)
paddle.enable_static() paddle.enable_static()
...@@ -111,14 +112,20 @@ class TestGradientMergePass(DistPassTestBase): ...@@ -111,14 +112,20 @@ class TestGradientMergePass(DistPassTestBase):
def init(self): def init(self):
self._params_grads = None self._params_grads = None
self._config = {"k_steps": 4, "avg": True} self._config = {"k_steps": 4, "avg": True}
#self._config["dist_context"] = DistributedContext()
def apply_passes(self, main_prog, startup_prog): def apply_passes(self, main_prog, startup_prog):
self._config["params_grads"] = self._params_grads #self._config["params_grads"] = self._params_grads
pass_context = PassContext() #pass_context = PassContext()
auto_parallel_gradient_merge_pass = new_pass( #auto_parallel_gradient_merge_pass = new_pass(
"auto_parallel_gradient_merge_pass", self._config) # "auto_parallel_gradient_merge_pass", self._config)
auto_parallel_gradient_merge_pass.apply([main_prog], [startup_prog], #auto_parallel_gradient_merge_pass.apply([main_prog], [startup_prog],
pass_context) # pass_context)
dist_strategy = fleet.DistributedStrategy()
dist_strategy.gradient_merge = True
dist_strategy.gradient_merge_configs = {"k_steps": 4, "avg": True}
dist_strategy.semi_auto = True
fleet.init(is_collective=True, strategy=dist_strategy)
def test_result(self): def test_result(self):
no_pass_rets = self._distributed_launch( no_pass_rets = self._distributed_launch(
...@@ -135,7 +142,7 @@ class TestGradientMergePass(DistPassTestBase): ...@@ -135,7 +142,7 @@ class TestGradientMergePass(DistPassTestBase):
gradient_merge=True, gradient_merge=True,
batch_size=8, batch_size=8,
max_step=8) max_step=8)
"""
# avg loss for gradient_merge pass # avg loss for gradient_merge pass
avg_loss = 0 avg_loss = 0
pass_avg_ret_list = [] pass_avg_ret_list = []
...@@ -156,6 +163,7 @@ class TestGradientMergePass(DistPassTestBase): ...@@ -156,6 +163,7 @@ class TestGradientMergePass(DistPassTestBase):
rtol=self.rtol, rtol=self.rtol,
atol=self.atol, atol=self.atol,
equal_nan=self.equal_nan)) equal_nan=self.equal_nan))
"""
def get_model(self, place, gradient_merge, batch_size, max_step): def get_model(self, place, gradient_merge, batch_size, max_step):
paddle.seed(2021) paddle.seed(2021)
......
...@@ -20,6 +20,7 @@ import unittest ...@@ -20,6 +20,7 @@ import unittest
import paddle import paddle
from paddle.fluid.dygraph.jit import declarative from paddle.fluid.dygraph.jit import declarative
from paddle.fluid.dygraph.dygraph_to_static.program_translator import ProgramTranslator from paddle.fluid.dygraph.dygraph_to_static.program_translator import ProgramTranslator
import paddle.fluid.core as core
from ifelse_simple_func import * from ifelse_simple_func import *
...@@ -379,7 +380,7 @@ class TestDy2StIfElseRetInt1(unittest.TestCase): ...@@ -379,7 +380,7 @@ class TestDy2StIfElseRetInt1(unittest.TestCase):
return out return out
def test_ast_to_func(self): def test_ast_to_func(self):
self.assertIsInstance(self.out[0], paddle.Tensor) self.assertIsInstance(self.out[0], (paddle.Tensor, core.eager.Tensor))
self.assertIsInstance(self.out[1], int) self.assertIsInstance(self.out[1], int)
...@@ -390,8 +391,8 @@ class TestDy2StIfElseRetInt2(TestDy2StIfElseRetInt1): ...@@ -390,8 +391,8 @@ class TestDy2StIfElseRetInt2(TestDy2StIfElseRetInt1):
self.out = self.get_dy2stat_out() self.out = self.get_dy2stat_out()
def test_ast_to_func(self): def test_ast_to_func(self):
self.assertIsInstance(self.out[0], paddle.Tensor) self.assertIsInstance(self.out[0], (paddle.Tensor, core.eager.Tensor))
self.assertIsInstance(self.out[1], paddle.Tensor) self.assertIsInstance(self.out[1], (paddle.Tensor, core.eager.Tensor))
class TestDy2StIfElseRetInt3(TestDy2StIfElseRetInt1): class TestDy2StIfElseRetInt3(TestDy2StIfElseRetInt1):
...@@ -401,7 +402,7 @@ class TestDy2StIfElseRetInt3(TestDy2StIfElseRetInt1): ...@@ -401,7 +402,7 @@ class TestDy2StIfElseRetInt3(TestDy2StIfElseRetInt1):
self.out = self.get_dy2stat_out() self.out = self.get_dy2stat_out()
def test_ast_to_func(self): def test_ast_to_func(self):
self.assertIsInstance(self.out, paddle.Tensor) self.assertIsInstance(self.out, (paddle.Tensor, core.eager.Tensor))
class TestDy2StIfElseRetInt4(TestDy2StIfElseRetInt1): class TestDy2StIfElseRetInt4(TestDy2StIfElseRetInt1):
......
...@@ -118,7 +118,8 @@ class TestWithNestedOutput(unittest.TestCase): ...@@ -118,7 +118,8 @@ class TestWithNestedOutput(unittest.TestCase):
self.assertTrue(len(dygraph_res) == len(static_res)) self.assertTrue(len(dygraph_res) == len(static_res))
for dy_var, st_var in zip(dygraph_res, static_res): for dy_var, st_var in zip(dygraph_res, static_res):
if isinstance(dy_var, fluid.core.VarBase): if isinstance(dy_var,
(fluid.core.VarBase, fluid.core.eager.Tensor)):
self.assertTrue(np.allclose(dy_var.numpy(), st_var.numpy())) self.assertTrue(np.allclose(dy_var.numpy(), st_var.numpy()))
else: else:
self.assertTrue(dy_var, st_var) self.assertTrue(dy_var, st_var)
......
...@@ -218,7 +218,7 @@ class TestReturnBase(unittest.TestCase): ...@@ -218,7 +218,7 @@ class TestReturnBase(unittest.TestCase):
res = self.dygraph_func(self.input) res = self.dygraph_func(self.input)
if isinstance(res, (tuple, list)): if isinstance(res, (tuple, list)):
return tuple(r.numpy() for r in res) return tuple(r.numpy() for r in res)
elif isinstance(res, core.VarBase): elif isinstance(res, (core.VarBase, core.eager.Tensor)):
return res.numpy() return res.numpy()
return res return res
......
...@@ -713,44 +713,76 @@ class OpTest(unittest.TestCase): ...@@ -713,44 +713,76 @@ class OpTest(unittest.TestCase):
def is_empty(a): def is_empty(a):
return isinstance(a, Empty) return isinstance(a, Empty)
def get_default(idx, all_params_number, defaults): def get_default(idx, defaults):
related_idx = idx - all_params_number + len(defaults) assert not isinstance(
assert related_idx >= 0, "%d-th arguments don't have default value" % idx defaults[idx], Empty
return defaults[related_idx] ), "%d-th params of python api don't have default value." % idx
return defaults[idx]
def filter_by_name(x):
names = set(['name', 'out', 'output'])
if isinstance(x, list): return [i for i in x if i not in names]
if isinstance(x, dict):
return {k: v for k, v in x.items() if k not in names}
assert False, "Only support list or dict."
def to_defaults_list(params, defaults): def to_defaults_list(params, defaults):
return [defaults[p] for p in params if p in defaults] return [defaults[p] for p in params if p in defaults]
# NOTE(xiongkun): why don't use input arguments dicts ? def parse_attri_value(name, op_inputs, op_attrs):
# Because we don't know the python api name of each arguments. """ parse true value from inputs and attrs, if there is no name passed by OpTest, return Empty
# using parse_arg_and_kwargs, we can get the all api information we need. 1. if the name in op_attrs, use the op_attrs[name]
api_params, api_defaults = [ 2. if the name in op_inputs, convert the op_inputs to [type of default value]
filter_by_name(item) for item in parse_arg_and_kwargs(api) 3. if the name is neither in op_attrs nor in op_inputs, return Empty. (this will use the default value from the python api)
] """
if name in op_proto_attrs:
return op_proto_attrs[name]
elif name in op_inputs:
assert op_inputs[name].__len__(
) == 1, "currently don't support multi-input in attribute."
# why not use numpy().item(): if the Tensor is float64, it would be changed to python.float32 and we would lose accuracy: [allclose_op]
# why we reconstruct a tensor: because we want the tensor on cpu.
return paddle.to_tensor(
op_inputs[name][0].numpy(), place='cpu')
else:
return Empty()
# NOTE(xiongkun): the logic of constructing parameters:
# for example:
# python api: cumprod(x, dim, dtype=None, name=None)
# kernel sig: [["x"], ["dim"], ["out"]]"
#
# we will construct a lot of list with the same length : len == len(api_params), here is 4
# api_params = ["x", "dim", "dtype", "name"]
# api_defaults = [Empty, Empty, None, None]; empty means no defaults.
# inputs_and_attrs = ["x", "dim"], the length may be shorter or longer than api_params
# input_arguments = [RealValue in self.inputs and self.attrs]
# then ,we will loop for the api_params, construct a result list:
# if the name in ['name', 'dtype', 'out', 'output'], we will use the default value
# else, we will consume one entry of input_arguments. (the names do not correspond, so we rely on the order)
api_params, api_defaults = parse_arg_and_kwargs(api)
api_defaults = to_defaults_list(api_params, api_defaults) api_defaults = to_defaults_list(api_params, api_defaults)
api_defaults = [
Empty() for i in range(len(api_params) - len(api_defaults))
] + api_defaults
assert len(api_defaults) == len(
api_params), "Error happens. contack xiongkun03 to solve."
inputs_sig, attrs_sig, outputs_sig = kernel_sig inputs_sig, attrs_sig, outputs_sig = kernel_sig
inputs_and_attrs = inputs_sig + attrs_sig inputs_and_attrs = inputs_sig + attrs_sig
assert (
len(api_params) == len(inputs_and_attrs)
), "inputs and attrs length must equals to python api length. (May be output is in argument list?)"
input_arguments = [op_proto_ins[name] for name in inputs_sig] + [ input_arguments = [op_proto_ins[name] for name in inputs_sig] + [
op_proto_attrs[name] if name in op_proto_attrs else Empty() parse_attri_value(name, op_proto_ins, op_proto_attrs)
for name in attrs_sig for name in attrs_sig
] ]
results = [] results = []
for idx, arg in enumerate(input_arguments): api_ignore_param_list = set(['name', 'dtype', 'out', 'output'])
if is_empty(arg): idx_of_op_proto_arguments = 0
results.append( for idx, arg_name in enumerate(api_params):
get_default(idx, len(input_arguments), api_defaults)) if arg_name in api_ignore_param_list:
results.append(get_default(idx, api_defaults))
else: else:
results.append(arg) assert idx_of_op_proto_arguments < len(
input_arguments), "Assert False."
tmp = input_arguments[idx_of_op_proto_arguments]
idx_of_op_proto_arguments += 1
if isinstance(tmp, Empty):
results.append(get_default(idx, api_defaults))
else:
results.append(tmp)
assert len(results) == len(api_params)
return results return results
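To make the NOTE above concrete, here is the cumprod example from the comment worked through the loop by hand (a sketch, not code from OpTest itself):

# python api: cumprod(x, dim, dtype=None, name=None)
# api_params             = ["x", "dim", "dtype", "name"]
# api_defaults (padded)  = [Empty(), Empty(), None, None]
# inputs_sig + attrs_sig = ["x", "dim"]
# input_arguments        = [<x Tensor from self.inputs>, <dim from self.attrs>]
#
# loop over api_params:
#   "x"     -> not ignored -> consume input_arguments[0] -> the x Tensor
#   "dim"   -> not ignored -> consume input_arguments[1] -> the dim value
#   "dtype" -> in api_ignore_param_list -> get_default(2, api_defaults) -> None
#   "name"  -> in api_ignore_param_list -> get_default(3, api_defaults) -> None
# results == [x Tensor, dim, None, None], len(results) == len(api_params)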
def construct_output_dict_by_kernel_sig(ret_tuple, output_sig): def construct_output_dict_by_kernel_sig(ret_tuple, output_sig):
......
...@@ -251,6 +251,9 @@ class EagerVariablePropertiesAndMethodsTestCase(unittest.TestCase): ...@@ -251,6 +251,9 @@ class EagerVariablePropertiesAndMethodsTestCase(unittest.TestCase):
self.assertTrue(egr_tensor12.place._equals(paddle.fluid.CPUPlace())) self.assertTrue(egr_tensor12.place._equals(paddle.fluid.CPUPlace()))
self.assertTrue(np.array_equal(egr_tensor12.numpy(), x)) self.assertTrue(np.array_equal(egr_tensor12.numpy(), x))
egr_tensor13 = paddle.randn([2, 2])
self.assertTrue("eager_tmp" in egr_tensor13.name)
with self.assertRaisesRegexp( with self.assertRaisesRegexp(
ValueError, "The shape of Parameter should not be None"): ValueError, "The shape of Parameter should not be None"):
eager_param = EagerParamBase(shape=None, dtype="float32") eager_param = EagerParamBase(shape=None, dtype="float32")
......
...@@ -64,7 +64,10 @@ class Collective_Test(unittest.TestCase): ...@@ -64,7 +64,10 @@ class Collective_Test(unittest.TestCase):
if args: if args:
cmd.extend(args.split(" ")) cmd.extend(args.split(" "))
cmd.extend([pyname]) cmd.extend([pyname])
proc = subprocess.Popen(cmd, env) env = os.environ.copy()
# virtual devices for testing
env.update({'CUDA_VISIBLE_DEVICES': '0,1,2,3,4,5,6,7'})
proc = subprocess.Popen(cmd, env=env)
return proc return proc
def test_collective_1(self): def test_collective_1(self):
......
...@@ -17,25 +17,53 @@ import unittest ...@@ -17,25 +17,53 @@ import unittest
import numpy as np import numpy as np
import paddle import paddle
from paddle import _C_ops from paddle import _C_ops
from paddle.fluid import core
from paddle.fluid.framework import _test_eager_guard from paddle.fluid.framework import _test_eager_guard
class TestSparseUtils(unittest.TestCase): class TestSparseUtils(unittest.TestCase):
def test_create_sparse_coo_tensor(self):
with _test_eager_guard():
non_zero_indices = [[0, 0, 1, 2, 2], [1, 3, 2, 0, 1]]
non_zero_elements = [1, 2, 3, 4, 5]
dense_shape = [3, 4]
dense_indices = paddle.to_tensor(non_zero_indices)
dense_elements = paddle.to_tensor(
non_zero_elements, dtype='float32')
stop_gradient = False
coo = core.eager.sparse_coo_tensor(dense_indices, dense_elements,
dense_shape, stop_gradient)
print(coo)
def test_create_sparse_csr_tensor(self):
with _test_eager_guard():
non_zero_crows = [0, 2, 3, 5]
non_zero_cols = [1, 3, 2, 0, 1]
non_zero_elements = [1, 2, 3, 4, 5]
dense_shape = [3, 4]
dense_crows = paddle.to_tensor(non_zero_crows)
dense_cols = paddle.to_tensor(non_zero_cols)
dense_elements = paddle.to_tensor(
non_zero_elements, dtype='float32')
stop_gradient = False
csr = core.eager.sparse_csr_tensor(dense_crows, dense_cols,
dense_elements, dense_shape,
stop_gradient)
print(csr)
def test_to_sparse_coo(self): def test_to_sparse_coo(self):
with _test_eager_guard(): with _test_eager_guard():
x = [[0, 1, 0, 2], [0, 0, 3, 0], [4, 5, 0, 0]] x = [[0, 1, 0, 2], [0, 0, 3, 0], [4, 5, 0, 0]]
non_zero_indices = [[0, 0, 1, 2, 2], [1, 3, 2, 0, 1]] non_zero_indices = [[0, 0, 1, 2, 2], [1, 3, 2, 0, 1]]
non_zero_elements = [1, 2, 3, 4, 5] non_zero_elements = [1, 2, 3, 4, 5]
dense_x = paddle.to_tensor(x) dense_x = paddle.to_tensor(x)
#TODO(zhangkaihuo): change to test the corresponding API out = dense_x.to_sparse_coo(2)
out = _C_ops.final_state_to_sparse_coo(dense_x, 2)
print(out)
assert np.array_equal(out.non_zero_indices().numpy(), assert np.array_equal(out.non_zero_indices().numpy(),
non_zero_indices) non_zero_indices)
assert np.array_equal(out.non_zero_elements().numpy(), assert np.array_equal(out.non_zero_elements().numpy(),
non_zero_elements) non_zero_elements)
dense_tensor = _C_ops.final_state_to_dense(out) dense_tensor = out.to_dense()
assert np.array_equal(dense_tensor.numpy(), x) assert np.array_equal(dense_tensor.numpy(), x)
def test_to_sparse_csr(self): def test_to_sparse_csr(self):
...@@ -45,14 +73,14 @@ class TestSparseUtils(unittest.TestCase): ...@@ -45,14 +73,14 @@ class TestSparseUtils(unittest.TestCase):
non_zero_cols = [1, 3, 2, 0, 1] non_zero_cols = [1, 3, 2, 0, 1]
non_zero_elements = [1, 2, 3, 4, 5] non_zero_elements = [1, 2, 3, 4, 5]
dense_x = paddle.to_tensor(x) dense_x = paddle.to_tensor(x)
out = _C_ops.final_state_to_sparse_csr(dense_x) out = dense_x.to_sparse_csr()
print(out) print(out)
assert np.array_equal(out.non_zero_crows().numpy(), non_zero_crows) assert np.array_equal(out.non_zero_crows().numpy(), non_zero_crows)
assert np.array_equal(out.non_zero_cols().numpy(), non_zero_cols) assert np.array_equal(out.non_zero_cols().numpy(), non_zero_cols)
assert np.array_equal(out.non_zero_elements().numpy(), assert np.array_equal(out.non_zero_elements().numpy(),
non_zero_elements) non_zero_elements)
dense_tensor = _C_ops.final_state_to_dense(out) dense_tensor = out.to_dense()
assert np.array_equal(dense_tensor.numpy(), x) assert np.array_equal(dense_tensor.numpy(), x)
......
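Taken together, the updated tests exercise the tensor-side conversion methods instead of the raw _C_ops entry points. A minimal round-trip sketch built only from the calls shown above, run under the same eager guard (variable names are illustrative):

import numpy as np
import paddle
from paddle.fluid.framework import _test_eager_guard

with _test_eager_guard():
    x = [[0, 1, 0, 2], [0, 0, 3, 0], [4, 5, 0, 0]]
    dense_x = paddle.to_tensor(x)
    # keep the first 2 dimensions sparse, matching the test above
    coo = dense_x.to_sparse_coo(2)
    csr = dense_x.to_sparse_csr()
    # both formats should reconstruct the original dense values
    assert np.array_equal(coo.to_dense().numpy(), x)
    assert np.array_equal(csr.to_dense().numpy(), x)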
...@@ -307,6 +307,7 @@ packages=['paddle', ...@@ -307,6 +307,7 @@ packages=['paddle',
'paddle.distributed.auto_parallel', 'paddle.distributed.auto_parallel',
'paddle.distributed.auto_parallel.operators', 'paddle.distributed.auto_parallel.operators',
'paddle.distributed.auto_parallel.tuner', 'paddle.distributed.auto_parallel.tuner',
'paddle.distributed.auto_parallel.cost',
'paddle.distributed.passes', 'paddle.distributed.passes',
'paddle.framework', 'paddle.framework',
'paddle.jit', 'paddle.jit',
......
...@@ -22,7 +22,9 @@ attr_type_converter = { ...@@ -22,7 +22,9 @@ attr_type_converter = {
"i": 'SI32Attr', "i": 'SI32Attr',
"b": 'BoolAttr', "b": 'BoolAttr',
"l": 'SI64Attr', "l": 'SI64Attr',
"f": 'F32Attr' "f": 'F32Attr',
"NSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE": 'StrAttr',
"St6vectorIiSaIiEE": 'I32ArrayAttr'
} }
target_type_converter = {"CPU": "CPU", "GPU": "GPU"} target_type_converter = {"CPU": "CPU", "GPU": "GPU"}
......
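The two keys added to attr_type_converter are GCC's Itanium-mangled type names (what typeid(...).name() returns) for std::string and std::vector<int>. As a quick sanity check, assuming a toolchain with c++filt on the PATH, such a key can be demangled from Python:

import subprocess

def demangle_type(mangled):
    # -t tells c++filt to treat the input as a type name, not a symbol
    out = subprocess.run(["c++filt", "-t", mangled],
                         capture_output=True, text=True, check=True)
    return out.stdout.strip()

print(demangle_type("St6vectorIiSaIiEE"))
# std::vector<int, std::allocator<int> >
print(demangle_type("NSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE"))
# std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >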
...@@ -38,35 +38,36 @@ python3 ${PADDLE_ROOT}/python/paddle/utils/code_gen/wrapped_infermeta_gen.py \ ...@@ -38,35 +38,36 @@ python3 ${PADDLE_ROOT}/python/paddle/utils/code_gen/wrapped_infermeta_gen.py \
--wrapped_infermeta_header_path ${temp_path}/generate.h \ --wrapped_infermeta_header_path ${temp_path}/generate.h \
--wrapped_infermeta_source_path ${temp_path}/generate.cc --wrapped_infermeta_source_path ${temp_path}/generate.cc
grep PD_REGISTER_INFER_META_FN ${temp_path}/generate.cc \ find ${PADDLE_ROOT}/paddle/phi/ -name "*.cc" | xargs grep PD_REGISTER_INFER_META_FN ${temp_path}/generate.cc \
| awk -F "\(|,|::|\)" '{print $2, $4}' > ${temp_path}/wrap_info.txt | awk -F "\(|,|::|\)" '{print $2, $4}' > ${temp_path}/wrap_info.txt
#step 3:get ir's attr_name. #step 3:get ir's attr_name.
ir_attr_name_info_file=`mktemp` ir_attr_name_info_file=`mktemp`
# phi_cpu attr # phi_cpu attr
all_ir_name=`grep -Eo "PDTCPU_Kernel<.*\"" paddle/infrt/dialect/phi/ir/phi_cpu_kernels.td | awk -v FS="<" '{gsub(/\"/,"");print $2}'` all_ir_name=`grep -Eo "PDTCPU_Kernel<.*\"" ${PADDLE_ROOT}/paddle/infrt/dialect/phi/ir/phi_cpu_kernels.td | awk -v FS="<" '{gsub(/\"/,"");print $2}'`
for ir in $all_ir_name for ir in $all_ir_name
do do
attr_name=`grep "<\"$ir" -A 3 paddle/infrt/dialect/phi/ir/phi_cpu_kernels.td | grep -Eo "Attr:.*)" \ attr_name=`grep "<\"$ir" -A 3 ${PADDLE_ROOT}/paddle/infrt/dialect/phi/ir/phi_cpu_kernels.td | grep -Eo "Attr:.*)" \
| awk '{gsub(/F32Attr/,"");gsub(/F64Attr/,"");gsub(/StrAttr/,"");gsub(/BoolAttr/,""); \ | awk '{gsub(/F32Attr/,"");gsub(/F64Attr/,"");gsub(/StrAttr/,"");gsub(/BoolAttr/,""); \
gsub(/SI1Attr/,"");gsub(/SI8Attr/,"");gsub(/SI16Attr/,"");gsub(/SI32Attr/,"");gsub(/SI64Attr/,""); \ gsub(/SI1Attr/,"");gsub(/SI8Attr/,"");gsub(/SI16Attr/,"");gsub(/SI32Attr/,"");gsub(/SI64Attr/,""); \
gsub(/UI1Attr/,"");gsub(/UI8Attr/,"");gsub(/I16Attr/,"");gsub(/I32Attr/,"");gsub(/I64Attr/,""); \ gsub(/UI1Attr/,"");gsub(/UI8Attr/,"");gsub(/I16Attr/,"");gsub(/I32Attr/,"");gsub(/I64Attr/,""); \
gsub(/I1Attr/,"");gsub(/I8Attr/,"");gsub(/UI16Attr/,"");gsub(/UI32Attr/,"");gsub(/UI64Attr/,""); \ gsub(/I1Attr/,"");gsub(/I8Attr/,"");gsub(/UI16Attr/,"");gsub(/UI32Attr/,"");gsub(/UI64Attr/,""); \
gsub(/I32ArrayAttr/,"");gsub(/SI32ArrayAttr/,""); \
gsub(/Attr/,"");gsub(/\)/,""); \ gsub(/Attr/,"");gsub(/\)/,""); \
gsub(/[,:]/,"");print $a}'` gsub(/[,:]/,"");print $a}'`
echo phi_cpu.$ir $attr_name >> $ir_attr_name_info_file echo phi_cpu.$ir $attr_name >> $ir_attr_name_info_file
done done
# phi_gpu attr # phi_gpu attr
all_ir_name=`grep -Eo "PDTGPU_Kernel<.*\"" paddle/infrt/dialect/phi/ir/phi_gpu_kernels.td | awk -v FS="<" '{gsub(/\"/,"");print $2}'` all_ir_name=`grep -Eo "PDTGPU_Kernel<.*\"" ${PADDLE_ROOT}/paddle/infrt/dialect/phi/ir/phi_gpu_kernels.td | awk -v FS="<" '{gsub(/\"/,"");print $2}'`
for ir in $all_ir_name for ir in $all_ir_name
do do
attr_name=`grep "<\"$ir" -A 3 paddle/infrt/dialect/phi/ir/phi_gpu_kernels.td | grep -Eo "Attr:.*)" \ attr_name=`grep "<\"$ir" -A 3 ${PADDLE_ROOT}/paddle/infrt/dialect/phi/ir/phi_gpu_kernels.td | grep -Eo "Attr:.*)" \
| awk '{gsub(/F32Attr/,"");gsub(/F64Attr/,"");gsub(/StrAttr/,"");gsub(/BoolAttr/,""); \ | awk '{gsub(/F32Attr/,"");gsub(/F64Attr/,"");gsub(/StrAttr/,"");gsub(/BoolAttr/,""); \
gsub(/SI1Attr/,"");gsub(/SI8Attr/,"");gsub(/SI16Attr/,"");gsub(/SI32Attr/,"");gsub(/SI64Attr/,""); \ gsub(/SI1Attr/,"");gsub(/SI8Attr/,"");gsub(/SI16Attr/,"");gsub(/SI32Attr/,"");gsub(/SI64Attr/,""); \
gsub(/UI1Attr/,"");gsub(/UI8Attr/,"");gsub(/I16Attr/,"");gsub(/I32Attr/,"");gsub(/I64Attr/,""); \ gsub(/UI1Attr/,"");gsub(/UI8Attr/,"");gsub(/I16Attr/,"");gsub(/I32Attr/,"");gsub(/I64Attr/,""); \
gsub(/I1Attr/,"");gsub(/I8Attr/,"");gsub(/UI16Attr/,"");gsub(/UI32Attr/,"");gsub(/UI64Attr/,""); \ gsub(/I1Attr/,"");gsub(/I8Attr/,"");gsub(/UI16Attr/,"");gsub(/UI32Attr/,"");gsub(/UI64Attr/,""); \
gsub(/Attr/,"");gsub(/\)/,""); \ gsub(/I32ArrayAttr/,"");gsub(/SI32ArrayAttr/,""); \
gsub(/Attr/,"");gsub(/\)/,"") \
gsub(/[,:]/,"");print $a}'` gsub(/[,:]/,"");print $a}'`
echo phi_gpu.$ir $attr_name >> $ir_attr_name_info_file echo phi_gpu.$ir $attr_name >> $ir_attr_name_info_file
done done
......
...@@ -58,7 +58,7 @@ def get_api_yaml_info(file_path): ...@@ -58,7 +58,7 @@ def get_api_yaml_info(file_path):
def get_kernel_info(file_path): def get_kernel_info(file_path):
f = open(file_path, "r") f = open(file_path, "r")
cont = f.readlines() cont = f.readlines()
return [l.strip() for l in cont] return [l.strip() for l in cont if l.strip() != ""]
def get_attr_info(file_path): def get_attr_info(file_path):
...@@ -91,11 +91,10 @@ def merge(infer_meta_data, kernel_data, wrap_data): ...@@ -91,11 +91,10 @@ def merge(infer_meta_data, kernel_data, wrap_data):
full_kernel_data = [] full_kernel_data = []
for l in kernel_data: for l in kernel_data:
key = l.split()[0] key = l.split()[0]
if key in meta_map: if key in wrap_map:
if key in meta_map: full_kernel_data.append((l + " " + wrap_map[key]).split())
full_kernel_data.append((l + " " + wrap_map[key]).split()) elif key in meta_map:
else: full_kernel_data.append((l + " " + meta_map[key]).split())
full_kernel_data.append((l + " " + meta_map[key]).split())
else: else:
full_kernel_data.append((l + " unknown").split()) full_kernel_data.append((l + " unknown").split())
...@@ -246,15 +245,10 @@ def gen_register_code_info(item: List[str], attr_data: Dict[str, List[str]]): ...@@ -246,15 +245,10 @@ def gen_register_code_info(item: List[str], attr_data: Dict[str, List[str]]):
registry->AddKernelWithAttrs("{ir_name}",""" registry->AddKernelWithAttrs("{ir_name}","""
res += f""" res += f"""
std::bind(&KernelLauncherFunc<decltype({kernel_func}), &KernelLauncherFunc<decltype({kernel_func}),
{kernel_func}, {kernel_func},
decltype({infer_shape_func}), decltype({infer_shape_func}),
{infer_shape_func}>, {infer_shape_func}>,
KernelLauncher<decltype({kernel_func}),
{kernel_func},
decltype({infer_shape_func}),
{infer_shape_func}>(),
std::placeholders::_1),
{{{attr_names}}}); {{{attr_names}}});
""" """
...@@ -263,15 +257,10 @@ registry->AddKernelWithAttrs("{ir_name}",""" ...@@ -263,15 +257,10 @@ registry->AddKernelWithAttrs("{ir_name}","""
registry->AddKernel("{ir_name}",""" registry->AddKernel("{ir_name}","""
res += f""" res += f"""
std::bind(&KernelLauncherFunc<decltype({kernel_func}), &KernelLauncherFunc<decltype({kernel_func}),
{kernel_func},
decltype({infer_shape_func}),
{infer_shape_func}>,
KernelLauncher<decltype({kernel_func}),
{kernel_func}, {kernel_func},
decltype({infer_shape_func}), decltype({infer_shape_func}),
{infer_shape_func}>(), {infer_shape_func}>);
std::placeholders::_1));
""" """
return res return res
......
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import re
import json
skip_list = []


def remove_grad_kernel(kernels):
    clean_kernels = []
    for kernel_ in kernels:
        if "_grad" not in kernel_:
            clean_kernels.append(kernel_)
    return clean_kernels


CPU_KERNEL_REGISTER = "REGISTER_OP_CPU_KERNEL("
GPU_KERNEL_REGISTER = "REGISTER_OP_CUDA_KERNEL("
XPU_KERNEL_REGISTER = "REGISTER_OP_XPU_KERNEL("


def get_compat_kernels_info(register):
    kernels_info = {}
    kernel_names = []
    for dirpath, dirnames, filenames in os.walk("../../paddle/fluid/operators"):
        for file_name in filenames:
            if ".cc" not in file_name:
                continue
            with open(os.path.join(dirpath, file_name)) as f:
                txt = f.readlines()
                content = ""
                registry = False
                is_macro_definition = False
                for line in txt:
                    # skip multi-line macro definitions, which may also
                    # contain registration statements
                    if line.strip().startswith("#define") and line.strip(
                    ).endswith("\\"):
                        is_macro_definition = True
                        continue
                    if is_macro_definition:
                        if not line.strip().endswith("\\"):
                            is_macro_definition = False
                        continue

                    if register in line:
                        content = ""
                        registry = True
                    if registry:
                        content += line
                    if registry and ";" in line:
                        # drop the registration macro text and keep the first
                        # argument, which is the kernel name
                        kernel_name = content.replace("\n", "").replace(
                            " ", "").replace(register, "").split(",")
                        registry = False
                        kernel_names.append(kernel_name[0])
    return remove_grad_kernel(kernel_names)


def show_kernel_statistics(backend, kernels):
    print("=== kernels statistics === ")
    print("the number of " + backend + " kernels is: " + str(len(kernels)) +
          "\n")
    print(kernels)
    print("\n")


def show_pass_statistics(backend, passes):
    print("=== Passes Statistics === ")
    print("The number of " + backend + " passes is: " + str(len(passes)) +
          "\n")
    print(passes)
    print("\n")


def get_passes_info(register):
    pass_registry_func = ""
    with open("../../paddle/fluid/inference/api/paddle_pass_builder.cc") as f:
        txt = f.readlines()
        stack = []
        registry_fun_found = False
        for line in txt:
            if line.strip().startswith("//"):
                continue
            if register in line:
                registry_fun_found = True
            if registry_fun_found:
                pass_registry_func += line
                # track braces to know when the constructor body ends
                for char in line:
                    if char == "{":
                        stack.append(char)
                    if char == "}":
                        stack.pop()
                    if len(stack) == 0:
                        registry_fun_found = False
    pass_list = re.findall("\"(.+?)_pass\"", pass_registry_func)
    return pass_list


if __name__ == "__main__":
    cpu_kernels = get_compat_kernels_info(CPU_KERNEL_REGISTER)
    gpu_kernels = get_compat_kernels_info(GPU_KERNEL_REGISTER)
    xpu_kernels = get_compat_kernels_info(XPU_KERNEL_REGISTER)
    show_kernel_statistics("CPU", cpu_kernels)
    show_kernel_statistics("GPU", gpu_kernels)
    show_kernel_statistics("XPU", xpu_kernels)

    cpu_passes = get_passes_info("CpuPassStrategy::CpuPassStrategy()")
    gpu_passes = get_passes_info("GpuPassStrategy::GpuPassStrategy()")
    show_pass_statistics("CPU", cpu_passes)
    show_pass_statistics("GPU", gpu_passes)