Commit 7ab3f36e authored by phlrain

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into add_some_yaml_config

@@ -100,7 +100,6 @@ function(kernel_library TARGET)
   set(xpu_srcs)
   set(gpudnn_srcs)
   set(kps_srcs)
-  set(selected_rows_srcs)
   # parse and save the deps kernel targets
   set(all_srcs)
   set(kernel_deps)
@@ -112,6 +111,12 @@ function(kernel_library TARGET)
   cmake_parse_arguments(kernel_library "${options}" "${oneValueArgs}"
                         "${multiValueArgs}" ${ARGN})

+  # used for cc_library selected_rows dir target
+  set(target_suffix "")
+  if ("${kernel_library_SUB_DIR}" STREQUAL "selected_rows_kernel")
+    set(target_suffix "_sr")
+  endif()
+
   list(LENGTH kernel_library_SRCS kernel_library_SRCS_len)
   # one kernel only matches one impl file in each backend
   if (${kernel_library_SRCS_len} EQUAL 0)
@@ -121,9 +126,6 @@ function(kernel_library TARGET)
     if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/cpu/${TARGET}.cc AND NOT WITH_XPU_KP)
       list(APPEND cpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/${TARGET}.cc)
     endif()
-    if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/selected_rows/${TARGET}.cc)
-      list(APPEND selected_rows_srcs ${CMAKE_CURRENT_SOURCE_DIR}/selected_rows/${TARGET}.cc)
-    endif()
     if (WITH_GPU OR WITH_ROCM)
       if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu)
         list(APPEND gpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu)
@@ -169,26 +171,46 @@ function(kernel_library TARGET)
   list(APPEND all_srcs ${xpu_srcs})
   list(APPEND all_srcs ${gpudnn_srcs})
   list(APPEND all_srcs ${kps_srcs})
+  set(all_include_kernels)
+  set(all_kernel_name)
   foreach(src ${all_srcs})
     file(READ ${src} target_content)
+    # "kernels/xxx" (DenseTensor Kernel) can only include each other, but can't include "SUB_DIR/xxx" (such as a selected_rows Kernel)
     string(REGEX MATCHALL "#include \"paddle\/phi\/kernels\/[a-z0-9_]+_kernel.h\"" include_kernels ${target_content})
-    if ("${kernel_library_SUB_DIR}" STREQUAL "")
-      string(REGEX MATCHALL "#include \"paddle\/phi\/kernels\/[a-z0-9_]+_kernel.h\"" include_kernels ${target_content})
-    else()
+    list(APPEND all_include_kernels ${include_kernels})
+    # "SUB_DIR/xxx" can include "kernels/xx" and "SUB_DIR/xxx"
+    if (NOT "${kernel_library_SUB_DIR}" STREQUAL "")
       string(REGEX MATCHALL "#include \"paddle\/phi\/kernels\/${kernel_library_SUB_DIR}\/[a-z0-9_]+_kernel.h\"" include_kernels ${target_content})
+      list(APPEND all_include_kernels ${include_kernels})
     endif()
-    foreach(include_kernel ${include_kernels})
+    foreach(include_kernel ${all_include_kernels})
       if ("${kernel_library_SUB_DIR}" STREQUAL "")
         string(REGEX REPLACE "#include \"paddle\/phi\/kernels\/" "" kernel_name ${include_kernel})
+        string(REGEX REPLACE ".h\"" "" kernel_name ${kernel_name})
+        list(APPEND all_kernel_name ${kernel_name})
       else()
-        string(REGEX REPLACE "#include \"paddle\/phi\/kernels\/${kernel_library_SUB_DIR}\/" "" kernel_name ${include_kernel})
+        # NOTE(dev): we should firstly match kernel_library_SUB_DIR.
+        if (${include_kernel} MATCHES "#include \"paddle\/phi\/kernels\/${kernel_library_SUB_DIR}\/")
+          string(REGEX REPLACE "#include \"paddle\/phi\/kernels\/${kernel_library_SUB_DIR}\/" "" kernel_name ${include_kernel})
+          # for selected_rows directory, add ${target_suffix}.
+          string(REGEX REPLACE ".h\"" "${target_suffix}" kernel_name ${kernel_name})
+          list(APPEND all_kernel_name ${kernel_name})
+        else()
+          string(REGEX REPLACE "#include \"paddle\/phi\/kernels\/" "" kernel_name ${include_kernel})
+          string(REGEX REPLACE ".h\"" "" kernel_name ${kernel_name})
+          list(APPEND all_kernel_name ${kernel_name})
+        endif()
+        message(STATUS "${TARGET} DEPS ${all_kernel_name}")
       endif()
-      string(REGEX REPLACE ".h\"" "" kernel_name ${kernel_name})
-      list(APPEND kernel_deps ${kernel_name})
+      list(APPEND kernel_deps ${all_kernel_name})
    endforeach()
  endforeach()
  list(REMOVE_DUPLICATES kernel_deps)
-  list(REMOVE_ITEM kernel_deps ${TARGET})
+  list(REMOVE_ITEM kernel_deps ${TARGET}${target_suffix})

  list(LENGTH common_srcs common_srcs_len)
  list(LENGTH cpu_srcs cpu_srcs_len)
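The two MATCHALL patterns above encode the include rule stated in the comments. A hypothetical sketch of how the rule plays out in kernel sources (file and kernel names are invented for illustration; the trailing comments show the dependency target each include resolves to under a selected_rows SUB_DIR build):

```cpp
// paddle/phi/kernels/cpu/abs_kernel.cc (hypothetical DenseTensor kernel):
// may only include plain "kernels/xxx_kernel.h" headers.
#include "paddle/phi/kernels/abs_kernel.h"                 // dep: abs_kernel

// paddle/phi/kernels/selected_rows/abs_kernel.cc (hypothetical SUB_DIR
// kernel): may include both kinds; since the SUB_DIR pattern is matched
// first, the second include becomes a dep on the "_sr"-suffixed target.
#include "paddle/phi/kernels/full_kernel.h"                // dep: full_kernel
#include "paddle/phi/kernels/selected_rows/full_kernel.h"  // dep: full_kernel_sr
```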
@@ -196,92 +218,73 @@ function(kernel_library TARGET)
   list(LENGTH xpu_srcs xpu_srcs_len)
   list(LENGTH gpudnn_srcs gpudnn_srcs_len)
   list(LENGTH kps_srcs kps_srcs_len)
-  list(LENGTH selected_rows_srcs selected_rows_srcs_len)

   # kernel source file level
   # level 1: base device kernel
   # - cpu_srcs / gpu_srcs / xpu_srcs / gpudnn_srcs / kps_srcs
   # level 2: device-independent kernel
   # - common_srcs
-  # level 3: Kernel implemented by reusing device-independent kernel
-  # - selected_rows_srcs

   set(base_device_kernels)
   set(device_independent_kernel)
-  set(high_level_kernels)

   # 1. Base device kernel compile
   if (${cpu_srcs_len} GREATER 0)
-    cc_library(${TARGET}_cpu SRCS ${cpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
-    list(APPEND base_device_kernels ${TARGET}_cpu)
+    cc_library(${TARGET}_cpu${target_suffix} SRCS ${cpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+    list(APPEND base_device_kernels ${TARGET}_cpu${target_suffix})
   endif()
   if (${gpu_srcs_len} GREATER 0)
     if (WITH_GPU)
-      nv_library(${TARGET}_gpu SRCS ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+      nv_library(${TARGET}_gpu${target_suffix} SRCS ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
     elseif (WITH_ROCM)
-      hip_library(${TARGET}_gpu SRCS ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+      hip_library(${TARGET}_gpu${target_suffix} SRCS ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
     endif()
-    list(APPEND base_device_kernels ${TARGET}_gpu)
+    list(APPEND base_device_kernels ${TARGET}_gpu${target_suffix})
   endif()
   if (${xpu_srcs_len} GREATER 0)
-    cc_library(${TARGET}_xpu SRCS ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
-    list(APPEND base_device_kernels ${TARGET}_xpu)
+    cc_library(${TARGET}_xpu${target_suffix} SRCS ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+    list(APPEND base_device_kernels ${TARGET}_xpu${target_suffix})
   endif()
   if (${gpudnn_srcs_len} GREATER 0)
     if (WITH_GPU)
-      nv_library(${TARGET}_gpudnn SRCS ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+      nv_library(${TARGET}_gpudnn${target_suffix} SRCS ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
     elseif (WITH_ROCM)
-      hip_library(${TARGET}_gpudnn SRCS ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+      hip_library(${TARGET}_gpudnn${target_suffix} SRCS ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
     endif()
-    list(APPEND base_device_kernels ${TARGET}_gpudnn)
+    list(APPEND base_device_kernels ${TARGET}_gpudnn${target_suffix})
   endif()
   if (${kps_srcs_len} GREATER 0)
     # only when WITH_XPU_KP, the kps_srcs_len can be > 0
-    xpu_library(${TARGET}_kps SRCS ${kps_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
-    list(APPEND base_device_kernels ${TARGET}_kps)
+    xpu_library(${TARGET}_kps${target_suffix} SRCS ${kps_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+    list(APPEND base_device_kernels ${TARGET}_kps${target_suffix})
   endif()

   # 2. Device-independent kernel compile
   if (${common_srcs_len} GREATER 0)
     if (WITH_GPU)
-      nv_library(${TARGET}_common SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels})
+      nv_library(${TARGET}_common${target_suffix} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels})
     elseif (WITH_ROCM)
-      hip_library(${TARGET}_common SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels})
+      hip_library(${TARGET}_common${target_suffix} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels})
     elseif (WITH_XPU_KP)
-      xpu_library(${TARGET}_common SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels})
+      xpu_library(${TARGET}_common${target_suffix} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels})
     else()
-      cc_library(${TARGET}_common SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels})
+      cc_library(${TARGET}_common${target_suffix} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels})
     endif()
-    list(APPEND device_independent_kernel ${TARGET}_common)
+    list(APPEND device_independent_kernel ${TARGET}_common${target_suffix})
   endif()

-  # 3. Reusing kernel compile
-  if (${selected_rows_srcs_len} GREATER 0)
-    if (WITH_GPU)
-      nv_library(${TARGET}_sr SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel})
-    elseif (WITH_ROCM)
-      hip_library(${TARGET}_sr SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel})
-    elseif (WITH_XPU_KP)
-      xpu_library(${TARGET}_sr SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel})
-    else()
-      cc_library(${TARGET}_sr SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel})
-    endif()
-    list(APPEND high_level_kernels ${TARGET}_sr)
-  endif()

-  # 4. Unify target compile
+  # 3. Unify target compile
   list(LENGTH base_device_kernels base_device_kernels_len)
   list(LENGTH device_independent_kernel device_independent_kernel_len)
-  list(LENGTH high_level_kernels high_level_kernels_len)
-  if (${base_device_kernels_len} GREATER 0 OR ${device_independent_kernel_len} GREATER 0 OR
-      ${high_level_kernels_len} GREATER 0)
+  if (${base_device_kernels_len} GREATER 0 OR ${device_independent_kernel_len} GREATER 0)
     if (WITH_GPU)
-      nv_library(${TARGET} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel} ${high_level_kernels})
+      nv_library(${TARGET}${target_suffix} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel})
     elseif (WITH_ROCM)
-      hip_library(${TARGET} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel} ${high_level_kernels})
+      hip_library(${TARGET}${target_suffix} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel})
     elseif (WITH_XPU_KP)
-      xpu_library(${TARGET} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel} ${high_level_kernels})
+      xpu_library(${TARGET}${target_suffix} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel})
     else()
-      cc_library(${TARGET} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel} ${high_level_kernels})
+      cc_library(${TARGET}${target_suffix} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel})
     endif()
   else()
     set(target_build_flag 0)
@@ -290,10 +293,10 @@ function(kernel_library TARGET)
   if (${target_build_flag} EQUAL 1)
     if (${common_srcs_len} GREATER 0 OR ${cpu_srcs_len} GREATER 0 OR
         ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0 OR
-        ${gpudnn_srcs_len} GREATER 0 OR ${selected_rows_srcs_len} GREATER 0)
+        ${gpudnn_srcs_len} GREATER 0)
       # append target into PHI_KERNELS property
       get_property(phi_kernels GLOBAL PROPERTY PHI_KERNELS)
-      set(phi_kernels ${phi_kernels} ${TARGET})
+      set(phi_kernels ${phi_kernels} ${TARGET}${target_suffix})
       set_property(GLOBAL PROPERTY PHI_KERNELS ${phi_kernels})
     endif()
@@ -318,9 +321,6 @@ function(kernel_library TARGET)
     if (${kps_srcs_len} GREATER 0)
       kernel_declare(${kps_srcs})
     endif()
-    if (${selected_rows_srcs_len} GREATER 0)
-      kernel_declare(${selected_rows_srcs})
-    endif()
   endif()
 endfunction()
......
@@ -219,13 +219,13 @@ message GraphParameter {
   optional string gpups_graph_sample_class = 3
       [ default = "CompleteGraphSampler" ];
   optional string gpups_graph_sample_args = 4 [ default = "" ];
-  optional bool use_cache = 5 [ default = true ];
-  optional float cache_ratio = 6 [ default = 0.3 ];
+  optional bool use_cache = 5 [ default = false ];
+  optional int32 cache_size_limit = 6 [ default = 100000 ];
   optional int32 cache_ttl = 7 [ default = 5 ];
   optional GraphFeature graph_feature = 8;
   optional string table_name = 9 [ default = "" ];
   optional string table_type = 10 [ default = "" ];
-  optional int32 gpups_mode_shard_num = 11 [ default = 127 ];
+  optional int32 shard_num = 11 [ default = 127 ];
   optional int32 gpu_num = 12 [ default = 1 ];
 }
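Downstream C++ code drives these fields through the standard protobuf-generated setters, as the updated tests later in this commit do for shard_num. A minimal sketch of a cache-enabled configuration (the header path and the values are illustrative assumptions, not taken from this commit):

```cpp
#include "paddle/fluid/distributed/ps.pb.h"  // assumed path of the generated proto header

::paddle::distributed::GraphParameter table_proto;
table_proto.set_shard_num(127);            // renamed from gpups_mode_shard_num
table_proto.set_use_cache(true);           // the default is now false
table_proto.set_cache_size_limit(100000);  // replaces the old float cache_ratio
table_proto.set_cache_ttl(5);
```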
......
@@ -138,7 +138,6 @@ int BasicBfsGraphSampler::run_graph_sampling() {
   int init_size = 0;
   //__sync_fetch_and_add
   std::function<int(int, int64_t)> bfs = [&, this](int i, int64_t id) -> int {
-    VLOG(0) << "in bfs " << i << " " << id;
     if (this->status == GraphSamplerStatus::terminating) {
       int task_left = __sync_sub_and_fetch(&task_size, 1);
       if (task_left == 0) {
@@ -148,13 +147,13 @@ int BasicBfsGraphSampler::run_graph_sampling() {
     }
     size_t ind = i % this->graph_table->task_pool_size_;
     if (nodes_left[i] > 0) {
-      nodes_left[i]--;
       auto iter = sample_neighbors_map[ind].find(id);
       if (iter == sample_neighbors_map[ind].end()) {
-        sample_neighbors_map[ind][id] = std::vector<int64_t>();
-        iter = sample_neighbors_map[ind].find(id);
         Node *node = graph_table->shards[i]->find_node(id);
         if (node != NULL) {
+          nodes_left[i]--;
+          sample_neighbors_map[ind][id] = std::vector<int64_t>();
+          iter = sample_neighbors_map[ind].find(id);
           size_t edge_fetch_size =
               std::min((size_t) this->edge_num_for_each_node,
                        node->get_neighbor_size());
@@ -179,11 +178,14 @@ int BasicBfsGraphSampler::run_graph_sampling() {
   for (size_t i = 0; i < graph_table->shards.size(); ++i) {
     std::vector<Node *> &v = graph_table->shards[i]->get_bucket();
     if (v.size() > 0) {
+      int search_size = std::min(init_search_size, (int)v.size());
+      for (int k = 0; k < search_size; k++) {
         init_size++;
         __sync_add_and_fetch(&task_size, 1);
-        int64_t id = v[0]->get_id();
+        int64_t id = v[k]->get_id();
         graph_table->_shards_task_pool[i % graph_table->task_pool_size_]
             ->enqueue(bfs, i, id);
+      }
     }  // if
   }
   if (init_size == 0) {
@@ -301,10 +303,11 @@ void BasicBfsGraphSampler::init(size_t gpu_num, GraphTable *graph_table,
                                 std::vector<std::string> args) {
   this->gpu_num = gpu_num;
   this->graph_table = graph_table;
-  node_num_for_each_shard = args.size() > 0 ? std::stoi(args[0]) : 10;
-  edge_num_for_each_node = args.size() > 1 ? std::stoi(args[1]) : 10;
-  rounds = args.size() > 2 ? std::stoi(args[2]) : 1;
-  interval = args.size() > 3 ? std::stoi(args[3]) : 60;
+  init_search_size = args.size() > 0 ? std::stoi(args[0]) : 10;
+  node_num_for_each_shard = args.size() > 1 ? std::stoi(args[1]) : 10;
+  edge_num_for_each_node = args.size() > 2 ? std::stoi(args[2]) : 10;
+  rounds = args.size() > 3 ? std::stoi(args[3]) : 1;
+  interval = args.size() > 4 ? std::stoi(args[4]) : 60;
 }
 #endif
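Because init() now consumes five positional fields, gpups_graph_sample_args strings gain a leading init_search_size entry. A self-contained sketch of the parsing convention (mirroring the ternary chain above; the args string is just an example matching the test below):

```cpp
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

int main() {
  // "5,5,1,1" -> init_search_size=5, node_num_for_each_shard=5,
  // edge_num_for_each_node=1, rounds=1; interval falls back to 60.
  std::string sample_args = "5,5,1,1";
  std::vector<std::string> args;
  std::stringstream ss(sample_args);
  for (std::string field; std::getline(ss, field, ',');) args.push_back(field);

  int init_search_size = args.size() > 0 ? std::stoi(args[0]) : 10;
  int node_num_for_each_shard = args.size() > 1 ? std::stoi(args[1]) : 10;
  int edge_num_for_each_node = args.size() > 2 ? std::stoi(args[2]) : 10;
  int rounds = args.size() > 3 ? std::stoi(args[3]) : 1;
  int interval = args.size() > 4 ? std::stoi(args[4]) : 60;

  std::cout << init_search_size << " " << node_num_for_each_shard << " "
            << edge_num_for_each_node << " " << rounds << " " << interval
            << "\n";  // prints: 5 5 1 1 60
}
```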
@@ -1092,11 +1095,6 @@ int32_t GraphTable::initialize(const GraphParameter &graph) {
 #ifdef PADDLE_WITH_HETERPS
   if (graph.gpups_mode()) {
     gpups_mode = true;
-    if (shard_num == 0) {
-      shard_num = graph.gpups_mode_shard_num();
-      server_num = 1;
-      _shard_idx = 0;
-    }
     auto *sampler =
         CREATE_PSCORE_CLASS(GraphSampler, graph.gpups_graph_sample_class());
     auto slices =
@@ -1107,7 +1105,18 @@ int32_t GraphTable::initialize(const GraphParameter &graph) {
     graph_sampler.reset(sampler);
   }
 #endif
+  if (shard_num == 0) {
+    server_num = 1;
+    _shard_idx = 0;
+    shard_num = graph.shard_num();
+  }
   task_pool_size_ = graph.task_pool_size();
+  use_cache = graph.use_cache();
+  if (use_cache) {
+    cache_size_limit = graph.cache_size_limit();
+    cache_ttl = graph.cache_ttl();
+    make_neighbor_sample_cache((size_t)cache_size_limit, (size_t)cache_ttl);
+  }
   _shards_task_pool.resize(task_pool_size_);
   for (size_t i = 0; i < _shards_task_pool.size(); ++i) {
     _shards_task_pool[i].reset(new ::ThreadPool(1));
......
@@ -547,6 +547,8 @@ class GraphTable : public SparseTable {
   std::unordered_set<int64_t> extra_nodes;
   std::unordered_map<int64_t, size_t> extra_nodes_to_thread_index;
   bool use_cache, use_duplicate_nodes;
+  int cache_size_limit;
+  int cache_ttl;
   mutable std::mutex mutex_;
   std::shared_ptr<pthread_rwlock_t> rw_lock;
 #ifdef PADDLE_WITH_HETERPS
@@ -593,7 +595,7 @@ class BasicBfsGraphSampler : public GraphSampler {
   std::vector<std::vector<paddle::framework::GpuPsGraphNode>> sample_nodes;
   std::vector<std::vector<int64_t>> sample_neighbors;
   size_t gpu_num;
-  int node_num_for_each_shard, edge_num_for_each_node;
+  int init_search_size, node_num_for_each_shard, edge_num_for_each_node;
   int rounds, interval;
   std::vector<std::unordered_map<int64_t, std::vector<int64_t>>>
       sample_neighbors_map;
......
@@ -456,7 +456,7 @@ void RunBrpcPushSparse() {
     pull_status.wait();
     ASSERT_EQ(_vs[0].size(), vs1[0].size());
-    for (int j = 0; j < _vs[0].size(); j++) {
+    for (size_t j = 0; j < _vs[0].size(); j++) {
       ASSERT_EQ(_vs[0][j], vs1[0][j]);
     }
   }
......
@@ -86,7 +86,7 @@ void testGraphSample() {
 #ifdef PADDLE_WITH_HETERPS
   ::paddle::distributed::GraphParameter table_proto;
   table_proto.set_gpups_mode(true);
-  table_proto.set_gpups_mode_shard_num(127);
+  table_proto.set_shard_num(127);
   table_proto.set_gpu_num(2);
   distributed::GraphTable graph_table, graph_table1;
@@ -113,7 +113,7 @@ void testGraphSample() {
   ::paddle::distributed::GraphParameter table_proto1;
   table_proto1.set_gpups_mode(true);
-  table_proto1.set_gpups_mode_shard_num(127);
+  table_proto1.set_shard_num(127);
   table_proto1.set_gpu_num(2);
   table_proto1.set_gpups_graph_sample_class("BasicBfsGraphSampler");
   table_proto1.set_gpups_graph_sample_args("5,5,1,1");
......
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import yaml
import re
import argparse
import os
########################
### Global Variables ###
########################
ops_to_fill_zero_for_empty_grads = set(["split"])
# For API dispatch used at python-level
# { op_name : [arg_name, ...] }
core_ops_returns_info = {}
core_ops_args_info = {}
core_ops_args_type_info = {}
yaml_types_mapping = {
'int' : 'int', 'int32' : 'int32_t', 'int64' : 'int64_t', 'size_t' : 'size_t', \
'float' : 'float', 'double' : 'double', 'bool' : 'bool', \
'str' : 'std::string', \
'Place' : 'paddle::experimental::Place', 'DataLayout' : 'paddle::experimental::DataLayout', 'DataType' : 'paddle::experimental::DataType', \
'int64[]' : 'std::vector<int64_t>', 'int[]' : 'std::vector<int>',
'Tensor' : 'Tensor',
'Tensor[]' : 'std::vector<Tensor>',
'Tensor[Tensor[]]' : 'std::vector<std::vector<Tensor>>',
'Scalar' : 'paddle::experimental::Scalar',
'ScalarArray' : 'paddle::experimental::ScalarArray'
}
#############################
### File Reader Helpers ###
#############################
def ReadFwdFile(filepath):
f = open(filepath, 'r')
contents = yaml.load(f, Loader=yaml.FullLoader)
f.close()
return contents
def ReadBwdFile(filepath):
f = open(filepath, 'r')
contents = yaml.load(f, Loader=yaml.FullLoader)
ret = {}
for content in contents:
if 'backward_api' in content.keys():
api_name = content['backward_api']
else:
assert False
ret[api_name] = content
f.close()
return ret
##################################
### Generic Helper Functions ###
##################################
def FindGradName(string):
return string + "_grad"
def FindForwardName(string):
if not string.endswith("_grad"):
return None
return string[:-5]
def IsPlainTensorType(string):
plain_tensor_types = ['Tensor&', 'Tensor', 'const Tensor&', 'const Tensor']
if string in plain_tensor_types:
return True
return False
def IsVectorTensorType(string):
vector_tensor_types = [
'std::vector<std::vector<Tensor>>', 'std::vector<Tensor>'
]
if string in vector_tensor_types:
return True
return False
def GetSavedName(string):
return string + "_"
def GetConstReference(string):
ret = string
if not string.startswith("const "):
ret = "const " + string
if not string.endswith("&"):
ret += "&"
return ret
def RemoveConstAndReference(string):
ret = string
if string.startswith("const "):
ret = ret[6:]
if string.endswith("&"):
ret = ret[:-1]
return ret
def GetGradNodeName(string):
return f"FinalGradNode{string}"
def GetDygraphForwardFunctionName(string):
return f"{string}_final_state_dygraph_function"
def GetIntermediateAPIFunctionName(string):
return string + "_intermediate"
def GetAutoGradMetaName(string):
return f"{string}_autograd_meta"
def GetAutoGradMetaVectorName(string):
return f"{string}_autograd_meta_vec"
def RemoveSpecialSymbolsInName(string):
# Remove any name after '@'
ret = string.split("@")[0]
return ret
def RecoverBaseNameOfInplaceFunction(function_name):
return function_name[:-1]
def GetInplacedFunctionName(function_name):
return function_name + "_"
def GetForwardFunctionName(string):
return f"{string}_final_state_dygraph_function"
######################
### Yaml Parsers ###
######################
def ParseYamlArgs(string):
# Example: const Tensor& x, const Tensor& y, bool transpose_x, bool transpose_y
# inputs_list = [ [arg_name, arg_type, orig_position], ...]
inputs_list = []
# attrs_list = [ [arg_name, arg_type, default_value, orig_position], ...]
attrs_list = []
args = [x.strip() for x in string.strip().split(",")]
atype = r'((const )?\S+) '
aname = r'(.*)'
pattern = f'{atype}{aname}'
for i in range(len(args)):
arg = args[i]
m = re.search(pattern, arg)
arg_type = m.group(1).strip()
arg_name = m.group(3).split("=")[0].strip()
default_value = m.group(3).split("=")[1].strip() if len(
m.group(3).split("=")) > 1 else None
assert arg_type in yaml_types_mapping.keys(
), f"The argument type {arg_type} in yaml config is not supported in yaml_types_mapping."
arg_type = yaml_types_mapping[arg_type]
arg_name = RemoveSpecialSymbolsInName(arg_name)
if "Tensor" in arg_type:
assert default_value is None
inputs_list.append([arg_name, arg_type, i])
else:
attrs_list.append([arg_name, arg_type, default_value, i])
return inputs_list, attrs_list
def ParseYamlReturns(string):
# Example0: Tensor(out), Tensor(out1)
# Example1: Tensor, Tensor
# Example2: Tensor[](out), Tensor
# list = [ [ret_name, ret_type, orig_position], ...]
returns_list = []
returns = [x.strip() for x in string.strip().split(",")]
for i in range(len(returns)):
ret = returns[i]
ret_name = ""
if "(" in ret and ")" in ret:
# Remove trailing ')'
ret = ret[:-1]
ret_type = ret.split("(")[0].strip()
ret_name = ret.split("(")[1].strip()
else:
ret_type = ret.strip()
assert ret_type in yaml_types_mapping.keys(
), f"The return type {ret_type} in yaml config is not supported in yaml_types_mapping."
ret_type = yaml_types_mapping[ret_type]
assert "Tensor" in ret_type
ret_name = RemoveSpecialSymbolsInName(ret_name)
returns_list.append([ret_name, ret_type, i])
return returns_list
def ParseYamlForwardFromBackward(string):
# Example: matmul (const Tensor& x, const Tensor& y, bool transpose_x, bool transpose_y) -> Tensor(out)
fname = r'(.*?)'
wspace = r'\s*'
fargs = r'(.*?)'
frets = r'(.*)'
    pattern = rf'{fname}{wspace}\({wspace}{fargs}{wspace}\){wspace}->{wspace}{frets}'
m = re.search(pattern, string)
function_name = m.group(1)
function_args = m.group(2)
function_returns = m.group(3)
forward_inputs_list, forward_attrs_list = ParseYamlArgs(function_args)
forward_returns_list = ParseYamlReturns(function_returns)
return forward_inputs_list, forward_attrs_list, forward_returns_list
def ParseYamlForward(args_str, returns_str):
# args Example: (const Tensor& x, const Tensor& y, bool transpose_x = false, bool transpose_y = false)
# returns Example: Tensor, Tensor
fargs = r'(.*?)'
wspace = r'\s*'
    args_pattern = rf'\({fargs}\)'
args_str = re.search(args_pattern, args_str).group(1)
inputs_list, attrs_list = ParseYamlArgs(args_str)
returns_list = ParseYamlReturns(returns_str)
return inputs_list, attrs_list, returns_list
def ParseYamlBackward(args_str, returns_str):
# args Example: (const Tensor& x, const Tensor& y, const Tensor& out_grad, bool transpose_x=false, bool transpose_y=false)
# returns Example: Tensor(x_grad), Tensor(y_grad)
fargs = r'(.*?)'
wspace = r'\s*'
    args_pattern = rf'\({fargs}\)'
args_str = re.search(args_pattern, args_str).group(1)
inputs_list, attrs_list = ParseYamlArgs(args_str)
returns_list = ParseYamlReturns(returns_str)
return inputs_list, attrs_list, returns_list
########################
### Generator Base ###
########################
class FunctionGeneratorBase:
def __init__(self, forward_api_contents, namespace):
self.forward_api_contents = forward_api_contents
self.namespace = namespace
self.forward_api_name = ""
self.orig_forward_inputs_list = [
] #[ [arg_name, arg_type, orig_position], ...]
self.orig_forward_attrs_list = [
] #[ [attr_name, attr_type, default_value, orig_position], ...]
self.orig_forward_returns_list = [
] #[ [ret_name, ret_type, orig_position], ...]
# Processed Forward Data
self.forward_inputs_position_map = {
} #{ "name" : [type, fwd_position] }
self.forward_outputs_position_map = {
} #{ "name" : [type, fwd_position] }
# Special Op Attributes
self.optional_inputs = [] #[name, ...]
self.no_need_buffers = [] #[name, ...]
self.intermediate_outputs = [] #[name, ...]
self.inplace_map = {} #{name : name, ...}
def ParseInplaceInfo(self):
forward_api_contents = self.forward_api_contents
if 'inplace' not in forward_api_contents.keys(): return
# inplace_map_str: "(x -> out0), (y -> out2)"
inplace_map_str = forward_api_contents['inplace']
for pair in inplace_map_str.split(","):
pair = pair.strip()
if pair.startswith("("):
pair = pair[1:]
if pair.endswith(")"):
pair = pair[:-1]
key = pair.split("->")[0].strip()
val = pair.split("->")[1].strip()
self.inplace_map[key] = val
def ParseNoNeedBuffer(self):
forward_api_contents = self.forward_api_contents
if 'no_need_buffer' in forward_api_contents.keys():
no_need_buffer_str = forward_api_contents['no_need_buffer']
for name in no_need_buffer_str.split(","):
name = name.strip()
name = RemoveSpecialSymbolsInName(name)
self.no_need_buffers.append(name.strip())
def ParseDispensable(self):
forward_api_contents = self.forward_api_contents
if 'optional' in forward_api_contents.keys():
optional_inputs_str = forward_api_contents['optional']
for name in optional_inputs_str.split(","):
name = name.strip()
name = RemoveSpecialSymbolsInName(name)
self.optional_inputs.append(name)
def ParseIntermediate(self):
forward_api_contents = self.forward_api_contents
if 'intermediate' in forward_api_contents.keys():
intermediate_str = forward_api_contents['intermediate']
for name in intermediate_str.split(","):
name = name.strip()
name = RemoveSpecialSymbolsInName(name)
self.intermediate_outputs.append(name)
def CollectOriginalForwardInfo(self):
forward_api_contents = self.forward_api_contents
self.forward_api_name = forward_api_contents['api']
forward_args_str = forward_api_contents['args']
forward_returns_str = forward_api_contents['output']
assert 'api' in forward_api_contents.keys(
), "Unable to find \"api\" in forward_api_contents keys"
assert 'args' in forward_api_contents.keys(
), "Unable to find \"args\" in forward_api_contents keys"
assert 'output' in forward_api_contents.keys(
), "Unable to find \"output\" in forward_api_contents keys"
# Collect Original Forward Inputs/Outputs and then perform validation checks
self.orig_forward_inputs_list, self.orig_forward_attrs_list, self.orig_forward_returns_list = ParseYamlForward(
forward_args_str, forward_returns_str)
def DetermineForwardPositionMap(self, forward_inputs_list,
forward_returns_list):
for i in range(len(forward_inputs_list)):
forward_input = forward_inputs_list[i]
input_name = forward_input[0]
input_type = forward_input[1]
input_pos = forward_input[2]
self.forward_inputs_position_map[
input_name] = [input_type, input_pos]
for i in range(len(forward_returns_list)):
forward_return = forward_returns_list[i]
return_name = forward_return[0]
return_type = forward_return[1]
return_pos = forward_return[2]
self.forward_outputs_position_map[
return_name] = [return_type, return_pos]
print("Generated Forward Input Position Map: ",
self.forward_inputs_position_map)
print("Generated Forward Output Position Map: ",
self.forward_outputs_position_map)
class YamlGeneratorBase:
def __init__(self, api_yaml_path):
self.namespace = ""
self.api_yaml_path = api_yaml_path
self.forward_api_list = []
def ParseForwardYamlContents(self):
api_yaml_path = self.api_yaml_path
self.forward_api_list = ReadFwdFile(api_yaml_path)
def InferNameSpace(self):
api_yaml_path = self.api_yaml_path
if "sparse" in api_yaml_path:
self.namespace = "sparse::"
@@ -15,7 +15,10 @@
 import os
 import argparse
 import logging
-from eager_gen import namespace, yaml_types_mapping, ReadFwdFile, ParseDispensable, IsVectorTensorType, GetForwardFunctionName, ParseYamlForward, DetermineForwardPositionMap, GetInplacedFunctionName, ParseInplaceInfo
+from codegen_utils import FunctionGeneratorBase, YamlGeneratorBase
+from codegen_utils import yaml_types_mapping
+from codegen_utils import ReadFwdFile, IsVectorTensorType, GetForwardFunctionName
+from codegen_utils import ParseYamlForward, GetInplacedFunctionName

 ###########################
 ## Global Configurations ##
@@ -121,7 +124,10 @@ FUNCTION_NAME_TEMPLATE = \
 PYTHON_C_FUNCTION_REG_TEMPLATE = \
-"{{\"final_state_{}\", (PyCFunction)(void(*)(void)) {}eager_final_state_api_{}, METH_VARARGS | METH_KEYWORDS, \"C++ interface function for {} in dygraph.\"}}"
+"""
+{{\"final_state_{}\", (PyCFunction)(void(*)(void)) {}eager_final_state_api_{}, METH_VARARGS | METH_KEYWORDS, \"C++ interface function for {} in dygraph.\"}}
+"""

 PYTHON_C_WRAPPER_TEMPLATE = \
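Each format() of this template yields one PyMethodDef-style entry in the generated C++ registration table. A hypothetical instantiation for an op named matmul in the top-level namespace (so the namespace slot is empty):

```cpp
// One generated registration entry (hypothetical op "matmul"):
{"final_state_matmul", (PyCFunction)(void (*)(void)) eager_final_state_api_matmul,
 METH_VARARGS | METH_KEYWORDS, "C++ interface function for matmul in dygraph."},
```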
@@ -229,77 +235,39 @@ NAMESPACE_WRAPPER_TEMPLATE = \
 #######################
 ## Generator Classes ##
 #######################
-class PythonCSingleFunctionGenerator:
-    def __init__(self, fwd_api_contents, namespace):
-        self.fwd_api_contents = fwd_api_contents
-        self.namespace = namespace
-
-        # Raw Contents
-        self.forward_api_name = ""
-        self.forward_args_str = ""
-        self.forward_returns_str = ""
-
-        # Raw Data
-        self.forward_attrs_list = None  #[ [attr_name, attr_type, default_value, orig_position], ...]
-        self.forward_inputs_list = None  #[ [arg_name, arg_type, orig_position], ...]
-        self.forward_returns_list = None  #[ [ret_name, ret_type, orig_position], ...]
-
-        # Processed Data
-        self.forward_inputs_position_map = None  #{ "name" : [type, fwd_position] }
-        self.forward_outputs_position_map = None  #{ "name" : [type, fwd_position] }
-
-        # Special Op Attributes
-        self.optional_inputs = []  #[name, ...]
+class PythonCSingleFunctionGenerator(FunctionGeneratorBase):
+    def __init__(self, forward_api_contents, namespace):
+        # Members from Parent:
+        #self.namespace
+        #self.forward_api_contents
+        #self.forward_api_name
+        #self.orig_forward_inputs_list
+        #self.orig_forward_attrs_list
+        #self.orig_forward_returns_list
+        #self.forward_inputs_position_map
+        #self.forward_outputs_position_map
+        #self.optional_inputs
+        #self.no_need_buffers
+        #self.intermediate_outputs
+        #self.inplace_map
+        FunctionGeneratorBase.__init__(self, forward_api_contents, namespace)

         self.is_forward_only = True

         # Generated Results
         self.python_c_function_str = ""
         self.python_c_function_reg_str = ""

-    def CollectRawContents(self):
-        fwd_api_contents = self.fwd_api_contents
-
-        assert 'api' in fwd_api_contents.keys(
-        ), "Unable to find \"api\" in fwd_api_contents keys"
-        assert 'args' in fwd_api_contents.keys(
-        ), "Unable to find \"args\" in fwd_api_contents keys"
-        assert 'output' in fwd_api_contents.keys(
-        ), "Unable to find \"output\" in fwd_api_contents keys"
-
-        self.forward_api_name = fwd_api_contents['api']
-        self.forward_args_str = fwd_api_contents['args']
-        self.forward_returns_str = fwd_api_contents['output']
-
     def CollectIsForwardOnly(self):
-        fwd_api_contents = self.fwd_api_contents
-        self.is_forward_only = False if 'backward' in fwd_api_contents.keys(
+        forward_api_contents = self.forward_api_contents
+        self.is_forward_only = False if 'backward' in forward_api_contents.keys(
         ) else True

-    def CollectOptionalInputs(self):
-        fwd_api_contents = self.fwd_api_contents
-        if 'optional' in fwd_api_contents.keys():
-            self.optional_inputs = ParseDispensable(fwd_api_contents[
-                'optional'])
-
-    def CollectForwardInOutAttr(self):
-        forward_args_str = self.forward_args_str
-        forward_returns_str = self.forward_returns_str
-        self.forward_inputs_list, self.forward_attrs_list, self.forward_returns_list = ParseYamlForward(
-            forward_args_str, forward_returns_str)
-
-    def CollectForwardPositionMap(self):
-        forward_inputs_list = self.forward_inputs_list
-        forward_returns_list = self.forward_returns_list
-        self.forward_inputs_position_map, self.forward_outputs_position_map = DetermineForwardPositionMap(
-            forward_inputs_list, forward_returns_list)
-
-    def GeneratePythonCFunction(self, inplace_map):
+    def GeneratePythonCFunction(self):
         namespace = self.namespace
-        forward_api_name = GetInplacedFunctionName(
-            self.forward_api_name) if inplace_map else self.forward_api_name
-        forward_attrs_list = self.forward_attrs_list
+        inplace_map = self.inplace_map
+        forward_api_name = self.forward_api_name
+        orig_forward_attrs_list = self.orig_forward_attrs_list
         forward_inputs_position_map = self.forward_inputs_position_map
         forward_outputs_position_map = self.forward_outputs_position_map
         optional_inputs = self.optional_inputs
@@ -326,7 +294,7 @@ class PythonCSingleFunctionGenerator:
         parse_attributes_str = ""

         # Generate Python-C Attributes Parsing Logic
-        for name, atype, _, pos in forward_attrs_list:
+        for name, atype, _, pos in orig_forward_attrs_list:
             parsing_function_name = FindParsingFunctionFromAttributeType(atype)
             parse_attributes_str += PARSE_PYTHON_C_ARGS_TEMPLATE.format(
                 name, pos, atype, name, parsing_function_name, name,
@@ -334,11 +302,11 @@ class PythonCSingleFunctionGenerator:
         # Generate Dygraph Function Call Logic
         num_args = len(forward_inputs_position_map.keys()) + len(
-            forward_attrs_list)
+            orig_forward_attrs_list)
         dygraph_function_call_list = ["" for i in range(num_args)]
         for name, (_, pos) in forward_inputs_position_map.items():
             dygraph_function_call_list[pos] = f"{name}"
-        for name, _, _, pos in forward_attrs_list:
+        for name, _, _, pos in orig_forward_attrs_list:
             dygraph_function_call_list[pos] = f"{name}"
         dygraph_function_call_str = ",".join(dygraph_function_call_list)
@@ -350,16 +318,6 @@ class PythonCSingleFunctionGenerator:
         fwd_function_name = FUNCTION_NAME_TEMPLATE.format(
             "::", namespace, GetForwardFunctionName(forward_api_name))

-        if inplace_map:
-            assert len(
-                inplace_map
-            ) == 1, f"size of inplace_map must be 1, but inplace_map of \"{forward_api_name}\" op got {len(inplace_map)}"
-            for inplace_input, inplace_output in inplace_map.items():
-                return_str = RETURN_INPLACE_PYOBJECT_TEMPLATE.format(
-                    forward_api_name, inplace_input, forward_api_name,
-                    inplace_output)
-                break
-        else:
-            return_str = "    return ToPyObject(out);"
+        return_str = "    return ToPyObject(out);"

         # Generate Record Event for performance profiling
# Generate Record Event for performance profiling # Generate Record Event for performance profiling
@@ -374,29 +332,56 @@ class PythonCSingleFunctionGenerator:
         self.python_c_function_reg_str = PYTHON_C_FUNCTION_REG_TEMPLATE.format(
             forward_api_name, namespace, forward_api_name, forward_api_name)

+        if len(inplace_map) > 0:
+            inplaced_forward_api_name = GetInplacedFunctionName(
+                self.forward_api_name)
+            assert len(
+                inplace_map
+            ) == 1, f"size of inplace_map must be 1, but inplace_map of \"{forward_api_name}\" op got {len(inplace_map)}"
+            for inplace_input, inplace_output in inplace_map.items():
+                return_str = RETURN_INPLACE_PYOBJECT_TEMPLATE.format(
+                    inplaced_forward_api_name, inplace_input,
+                    inplaced_forward_api_name, inplace_output)
+                break
+
+            self.python_c_function_str += PYTHON_C_FUNCTION_TEMPLATE.format(
+                inplaced_forward_api_name, pythonc_record_event_str,
+                inplaced_forward_api_name, get_eager_tensor_str,
+                parse_attributes_str, fwd_function_name,
+                dygraph_function_call_str, return_str)
+
+            # Generate Python-C Function Registration
+            self.python_c_function_reg_str += "\n," + PYTHON_C_FUNCTION_REG_TEMPLATE.format(
+                inplaced_forward_api_name, namespace, inplaced_forward_api_name,
+                inplaced_forward_api_name)
+
-    def run(self, inplace_map):
+    def run(self):
         # Initialized is_forward_only
         self.CollectIsForwardOnly()

-        # Initialized forward_api_name, forward_args_str, forward_returns_str
-        self.CollectRawContents()
-        if SkipAPIGeneration(self.forward_api_name): return False
-
         # Initialized optional_inputs
-        self.CollectOptionalInputs()
+        self.ParseDispensable()
+
+        # Initialized inplace_map
+        self.ParseInplaceInfo()

-        # Initialized forward_inputs_list, forward_returns_list, forward_attrs_list
-        self.CollectForwardInOutAttr()
+        # Initialized orig_forward_inputs_list, orig_forward_returns_list, orig_forward_attrs_list
+        self.CollectOriginalForwardInfo()
         logging.info(
-            f"Parsed Original Forward Inputs List: \n{self.forward_inputs_list}")
+            f"Parsed Original Forward Inputs List: \n{self.orig_forward_inputs_list}"
+        )
         logging.info(
-            f"Parsed Original Forward Attrs List: \n{self.forward_attrs_list}")
+            f"Parsed Original Forward Attrs List: \n{self.orig_forward_attrs_list}"
+        )
         logging.info(
-            f"Parsed Original Forward Returns List: \n{self.forward_returns_list}"
+            f"Parsed Original Forward Returns List: \n{self.orig_forward_returns_list}"
         )

+        if SkipAPIGeneration(self.forward_api_name): return False
+
         # Initialized forward_inputs_position_map, forward_outputs_position_map
-        self.CollectForwardPositionMap()
+        self.DetermineForwardPositionMap(self.orig_forward_inputs_list,
+                                         self.orig_forward_returns_list)
         logging.info(
             f"Generated Forward Input Position Map: {self.forward_inputs_position_map}"
         )
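With inplace handling folded into GeneratePythonCFunction, a single run() can now emit both the plain and the inplace registration for an op that declares an inplace variant. A sketch of the resulting pair of generated C++ entries for a hypothetical matmul op:

```cpp
// Hypothetical pair emitted for an op "matmul" that declares inplace:
{"final_state_matmul", (PyCFunction)(void (*)(void)) eager_final_state_api_matmul,
 METH_VARARGS | METH_KEYWORDS, "C++ interface function for matmul in dygraph."},
{"final_state_matmul_", (PyCFunction)(void (*)(void)) eager_final_state_api_matmul_,
 METH_VARARGS | METH_KEYWORDS, "C++ interface function for matmul_ in dygraph."},
```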
@@ -405,7 +390,7 @@ class PythonCSingleFunctionGenerator:
         )

         # Code Generation
-        self.GeneratePythonCFunction(inplace_map)
+        self.GeneratePythonCFunction()
         logging.info(
             f"Generated Python-C Function: {self.python_c_function_str}")
         logging.info(
@@ -415,21 +400,18 @@ class PythonCSingleFunctionGenerator:
         return True

-class PythonCYamlGenerator:
+class PythonCYamlGenerator(YamlGeneratorBase):
     def __init__(self, path):
-        self.yaml_path = path
-
-        self.namespace = ""
-        self.forward_api_list = []
+        # Parent members:
+        # self.namespace
+        # self.api_yaml_path
+        # self.forward_api_list
+        YamlGeneratorBase.__init__(self, path)

         # Generated Result
         self.python_c_functions_reg_str = ""
         self.python_c_functions_str = ""

-    def ParseYamlContents(self):
-        yaml_path = self.yaml_path
-        self.forward_api_list = ReadFwdFile(yaml_path)
-
     def GeneratePythonCFunctions(self):
         namespace = self.namespace
         forward_api_list = self.forward_api_list
@@ -437,28 +419,12 @@ class PythonCYamlGenerator:
         for forward_api_content in forward_api_list:
             f_generator = PythonCSingleFunctionGenerator(forward_api_content,
                                                          namespace)
-            status = f_generator.run({})
+            status = f_generator.run()

             if status == True:
                 self.python_c_functions_reg_str += f_generator.python_c_function_reg_str + ",\n"
                 self.python_c_functions_str += f_generator.python_c_function_str + "\n"

-            if 'inplace' in forward_api_content.keys():
-                inplace_map = ParseInplaceInfo(forward_api_content['inplace'])
-
-                f_generator_inplace = PythonCSingleFunctionGenerator(
-                    forward_api_content, namespace)
-                status = f_generator_inplace.run(inplace_map)
-
-                if status == True:
-                    self.python_c_functions_reg_str += f_generator_inplace.python_c_function_reg_str + ",\n"
-                    self.python_c_functions_str += f_generator_inplace.python_c_function_str + "\n"
-
-    def InferNameSpace(self):
-        yaml_path = self.yaml_path
-        if "sparse" in yaml_path:
-            self.namespace = "sparse::"
-
     def AttachNamespace(self):
         namespace = self.namespace
         python_c_functions_str = self.python_c_functions_str
@@ -474,7 +440,7 @@ class PythonCYamlGenerator:
         self.InferNameSpace()

         # Read Yaml file
-        self.ParseYamlContents()
+        self.ParseForwardYamlContents()

         # Code Generation
         self.GeneratePythonCFunctions()
......
@@ -51,8 +51,7 @@ static std::vector<std::string> GetTensorsName(
 }

 static void CheckInputVarStatus(const Tensor &tensor) {
-  PADDLE_ENFORCE_EQ(
-      tensor.defined() && phi::DenseTensor::classof(tensor.impl().get()), true,
+  PADDLE_ENFORCE_EQ(tensor.defined() && tensor.is_dense_tensor(), true,
                     paddle::platform::errors::InvalidArgument(
                         "The input tensor %s of "
                         "RunProgram(Grad)Op holds "
@@ -74,7 +73,7 @@ static void CheckOutputVarStatus(const paddle::framework::Variable &src_var,
                     paddle::platform::errors::InvalidArgument(
                         "dst_tensor shall be defined."));

-  if (phi::DenseTensor::classof(dst_tensor.impl().get())) {
+  if (dst_tensor.is_dense_tensor()) {
     auto &src_tensor = src_var.Get<phi::DenseTensor>();
     PADDLE_ENFORCE_EQ(phi::DenseTensor::classof(&src_tensor), true,
                       paddle::platform::errors::InvalidArgument(
@@ -88,7 +87,7 @@ static void CheckOutputVarStatus(const paddle::framework::Variable &src_var,
                           "RunProgram(Grad)Op's internal "
                           "scope is not initialized.",
                           name));
-  } else if (phi::SelectedRows::classof(dst_tensor.impl().get())) {
+  } else if (dst_tensor.is_selected_rows()) {
     auto &src_tensor = src_var.Get<phi::SelectedRows>();
     PADDLE_ENFORCE_EQ(phi::SelectedRows::classof(&src_tensor), true,
                       paddle::platform::errors::InvalidArgument(
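This hunk and the previous ones replace hand-written classof() checks with the Tensor type predicates. A minimal sketch of the equivalence, inferred only from the replaced call sites (these are not the actual member implementations; header paths are assumptions):

```cpp
#include "paddle/phi/api/include/tensor.h"  // assumed header paths
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/selected_rows.h"

// Free-function forms of what t.is_dense_tensor() / t.is_selected_rows()
// compute: a classof() test on the tensor's underlying TensorBase impl.
// The defined() guard is added here only to make the sketch null-safe.
static bool IsDenseTensor(const paddle::experimental::Tensor &t) {
  return t.defined() && phi::DenseTensor::classof(t.impl().get());
}
static bool IsSelectedRows(const paddle::experimental::Tensor &t) {
  return t.defined() && phi::SelectedRows::classof(t.impl().get());
}
```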
@@ -159,9 +158,6 @@ static void ShareTensorsFromScope(
                           name));
     CheckOutputVarStatus(*var, *tensors[i]);
     // share tensor
-    // TODO(dev): Determine Tensor type by scope.var
-    // auto tensor_base = tensors[i]->impl();
-    // if (phi::DenseTensor::classof(tensor_base.get())) {
     if (var->IsType<phi::DenseTensor>()) {
       auto &src_tensor = var->Get<phi::DenseTensor>();
       auto *dst_tensor = const_cast<phi::DenseTensor *>(
@@ -169,7 +165,6 @@ static void ShareTensorsFromScope(
       VLOG(2) << "share " << name << " from scope";
       *dst_tensor = src_tensor;
     } else if (var->IsType<phi::SelectedRows>()) {
-      // } else if (phi::SelectedRows::classof(tensor_base.get())) {
       auto &src_tensor = var->Get<phi::SelectedRows>();
       auto *dst_tensor = const_cast<phi::SelectedRows *>(
           dynamic_cast<const phi::SelectedRows *>(tensors[i]->impl().get()));
@@ -202,7 +197,6 @@ inline void RunProgramAPI(
         "The OutScope of RunProgramGradOp should only hold one scope."));

   // Step 2. prepare executor and init persistable variables
-
   // NOTE(Aurelius84): While training some models, forward can be called many
   // times and then apply backpropagation all at once, such as Reinforcement
   // Learning. Tensor data in multi-step training should be saved into single
@@ -277,11 +271,6 @@ inline void RunProgramGradAPI(
   // if all output vars are set to stop_gradient, the grad op need not be executed
   if (x_grad.empty() && params_grad.empty()) return;

-  // TODO(dev): Remove this line hard code. And need to deal with the out_grad
-  // name problem.
-  // const_cast<paddle::experimental::Tensor &>(out_grad[0])
-  //     .set_name("matmul_v2_0.tmp_0@GRAD");
-
   auto *global_block =
       BOOST_GET_CONST(paddle::framework::BlockDesc *, attrs.at("global_block"));
   auto orig_end_op_index = BOOST_GET_CONST(int64_t, attrs.at("end_op_index"));
@@ -381,8 +370,8 @@ class GradNodeRunProgram : public egr::GradNodeBase {
     VLOG(3) << "out_grads[0].size() : " << grads[0].size();
     std::vector<paddle::experimental::Tensor> x_grad;
     std::vector<paddle::experimental::Tensor> params_grad;
-    ConstructGradTensors(x_, &x_grad);
-    ConstructGradTensors(params_, &params_grad);
+    ConstructXGradTensors(x_, &x_grad);
+    ConstructParamGradTensors(params_, &params_grad);
     std::vector<paddle::experimental::Tensor *> x_grad_ptr;
     std::vector<paddle::experimental::Tensor *> params_grad_ptr;
     for (auto &i : x_grad) {
@@ -392,9 +381,6 @@ class GradNodeRunProgram : public egr::GradNodeBase {
       params_grad_ptr.emplace_back(&i);
     }

-    // auto x_grad_ptr = ConstructGradTensors(x_);
-    // auto params_grad_ptr = ConstructGradTensors(params_);
-
     PADDLE_ENFORCE_EQ(
         grads[0].size(), fwd_out_names_.size(),
         paddle::platform::errors::InvalidArgument(
@@ -412,7 +398,6 @@ class GradNodeRunProgram : public egr::GradNodeBase {
                  params_grad_ptr);
     VLOG(3) << "End Eager Backward Node: GradNodeRunProgram";
     return {x_grad, params_grad};
-    // return {x_grad, details::DereferenceTensors(params_grad_ptr)};
   }

   void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; }
...@@ -447,29 +432,35 @@ class GradNodeRunProgram : public egr::GradNodeBase { ...@@ -447,29 +432,35 @@ class GradNodeRunProgram : public egr::GradNodeBase {
} }
protected: protected:
void ConstructGradTensors( void ConstructXGradTensors(
const std::vector<paddle::experimental::Tensor> &fwd_tensors, const std::vector<paddle::experimental::Tensor> &x,
std::vector<paddle::experimental::Tensor> *grad_tensors) { std::vector<paddle::experimental::Tensor> *x_grad) {
// TODO(dev): Need an elegant way to determine information of grad_tensor, // TODO(dev): Need an elegant way to determine information of grad_tensor,
// such as: name, tensor type (DenseTensor or SelectedRows). // such as: name, tensor type (DenseTensor or SelectedRows).
VLOG(3) << "fwd_tensors.size(): " << fwd_tensors.size(); for (auto &t : x) {
for (auto &fwd_t : fwd_tensors) { if (t.is_dense_tensor()) {
if (phi::DenseTensor::classof(fwd_t.impl().get())) { x_grad->emplace_back(std::make_shared<phi::DenseTensor>());
grad_tensors->emplace_back(std::make_shared<phi::DenseTensor>()); } else if (t.is_selected_rows()) {
} else if (phi::SelectedRows::classof(fwd_t.impl().get())) { x_grad->emplace_back(std::make_shared<phi::SelectedRows>());
grad_tensors->emplace_back(std::make_shared<phi::SelectedRows>()); }
} x_grad->back().set_name(t.name() + "@GRAD");
auto &grad_t = grad_tensors->back(); }
grad_t.set_name(fwd_t.name() + "@GRAD"); }
}
} void ConstructParamGradTensors(
const std::vector<paddle::experimental::Tensor> &param,
void ConstructGradTensors( std::vector<paddle::experimental::Tensor> *param_grad) {
const std::vector<paddle::experimental::Tensor> &fwd_tensors) { for (auto &t : param) {
VLOG(3) << "fwd_tensors.size(): " << fwd_tensors.size(); auto t_meta = egr::EagerUtils::unsafe_autograd_meta(t);
for (auto &fwd_t : fwd_tensors) { auto t_grad = egr::EagerUtils::unsafe_autograd_meta(t)->Grad();
auto grad_tensor = egr::EagerUtils::unsafe_autograd_meta(fwd_t)->Grad(); if (t_meta->StopGradient()) {
grad_tensor.set_name(fwd_t.name() + "@GRAD"); param_grad->emplace_back();
} else if (t_grad.is_dense_tensor()) {
param_grad->emplace_back(std::make_shared<phi::DenseTensor>());
} else if (t_grad.is_selected_rows()) {
param_grad->emplace_back(std::make_shared<phi::SelectedRows>());
}
param_grad->back().set_name(t.name() + "@GRAD");
} }
} }
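A minimal, framework-free sketch of the dispatch these two helpers perform (the Slot struct and every name below are illustrative stand-ins, not Paddle API): each grad slot's impl type mirrors its forward tensor's type, and a stop-gradient param still gets an empty placeholder so its position stays aligned with params_grad_ptr.

#include <cassert>
#include <memory>
#include <string>
#include <vector>

struct Slot {                 // stands in for paddle::experimental::Tensor
  std::shared_ptr<int> impl;  // stands in for the DenseTensor / SelectedRows impl
  std::string name;
};

int main() {
  struct Fwd { bool is_dense; bool stop_gradient; std::string name; };
  std::vector<Fwd> params = {{true, false, "w"}, {false, true, "emb"}};
  std::vector<Slot> param_grad;
  for (auto &p : params) {
    if (p.stop_gradient) {
      param_grad.emplace_back();  // empty slot keeps indices aligned
    } else {
      // pick the grad impl type from the forward tensor's type
      param_grad.push_back({std::make_shared<int>(p.is_dense ? 0 : 1), ""});
    }
    param_grad.back().name = p.name + "@GRAD";
  }
  assert(param_grad.size() == params.size());
  return 0;
}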
......
...@@ -271,6 +271,7 @@ void EagerUtils::GetOutput(const std::shared_ptr<EagerVariable>& out, ...@@ -271,6 +271,7 @@ void EagerUtils::GetOutput(const std::shared_ptr<EagerVariable>& out,
"shared_ptr, this error may indicate some outputs " "shared_ptr, this error may indicate some outputs "
"are nullptr")); "are nullptr"));
out_var->set_impl(out->GetTensorBase()); out_var->set_impl(out->GetTensorBase());
out_var->set_name(out->name());
} }
void EagerUtils::GetOutputs( void EagerUtils::GetOutputs(
......
...@@ -13,6 +13,9 @@ IF(WITH_GPU) ...@@ -13,6 +13,9 @@ IF(WITH_GPU)
nv_library(graph_gpu_ps SRCS graph_gpu_ps_table.h DEPS heter_comm table) nv_library(graph_gpu_ps SRCS graph_gpu_ps_table.h DEPS heter_comm table)
nv_test(test_graph_comm SRCS test_graph.cu DEPS graph_gpu_ps) nv_test(test_graph_comm SRCS test_graph.cu DEPS graph_gpu_ps)
nv_test(test_cpu_graph_sample SRCS test_cpu_graph_sample.cu DEPS graph_gpu_ps) nv_test(test_cpu_graph_sample SRCS test_cpu_graph_sample.cu DEPS graph_gpu_ps)
#nv_test(test_sample_rate SRCS test_sample_rate.cu DEPS graph_gpu_ps)
# ADD_EXECUTABLE(test_sample_rate test_sample_rate.cu)
# target_link_libraries(test_sample_rate graph_gpu_ps)
ENDIF() ENDIF()
IF(WITH_ROCM) IF(WITH_ROCM)
hip_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h DEPS cub device_context) hip_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h DEPS cub device_context)
......
...@@ -93,14 +93,17 @@ node_list[8]-> node_id:17, neighbor_size:1, neighbor_offset:15 ...@@ -93,14 +93,17 @@ node_list[8]-> node_id:17, neighbor_size:1, neighbor_offset:15
struct NeighborSampleResult { struct NeighborSampleResult {
int64_t *val; int64_t *val;
int *actual_sample_size, sample_size, key_size; int *actual_sample_size, sample_size, key_size;
int *offset;
NeighborSampleResult(int _sample_size, int _key_size) NeighborSampleResult(int _sample_size, int _key_size)
: sample_size(_sample_size), key_size(_key_size) { : sample_size(_sample_size), key_size(_key_size) {
actual_sample_size = NULL; actual_sample_size = NULL;
val = NULL; val = NULL;
offset = NULL;
}; };
~NeighborSampleResult() { ~NeighborSampleResult() {
if (val != NULL) cudaFree(val); if (val != NULL) cudaFree(val);
if (actual_sample_size != NULL) cudaFree(actual_sample_size); if (actual_sample_size != NULL) cudaFree(actual_sample_size);
if (offset != NULL) cudaFree(offset);
} }
}; };
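The new offset field pairs with actual_sample_size to index the flattened val buffer: key i's neighbors occupy val[offset[i] .. offset[i] + actual_sample_size[i]). A host-side sketch of that walk, assuming the arrays are host copies of the device buffers and reusing the values the updated test below expects:

#include <cstdio>

int main() {
  // 3 keys sampled with sample_size = 3; key 1 only has one neighbor.
  long long val[] = {28, 29, 30, 0, 21, 22, 23};  // flattened neighbor ids
  int actual_sample_size[] = {3, 1, 3};           // neighbors found per key
  int offset[] = {0, 3, 4};                       // cumulative start positions
  for (int i = 0; i < 3; ++i) {
    std::printf("key %d:", i);
    for (int j = offset[i]; j < offset[i] + actual_sample_size[i]; ++j) {
      std::printf(" %lld", val[j]);
    }
    std::printf("\n");
  }
  return 0;
}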
......
...@@ -71,10 +71,10 @@ TEST(TEST_FLEET, graph_sample) { ...@@ -71,10 +71,10 @@ TEST(TEST_FLEET, graph_sample) {
*/ */
::paddle::distributed::GraphParameter table_proto; ::paddle::distributed::GraphParameter table_proto;
table_proto.set_gpups_mode(true); table_proto.set_gpups_mode(true);
table_proto.set_gpups_mode_shard_num(127); table_proto.set_shard_num(127);
table_proto.set_gpu_num(3); table_proto.set_gpu_num(3);
table_proto.set_gpups_graph_sample_class("BasicBfsGraphSampler"); table_proto.set_gpups_graph_sample_class("BasicBfsGraphSampler");
table_proto.set_gpups_graph_sample_args("5,5,1,1"); table_proto.set_gpups_graph_sample_args("100,5,5,1,1");
prepare_file(edge_file_name, edges); prepare_file(edge_file_name, edges);
g.init_cpu_table(table_proto); g.init_cpu_table(table_proto);
g.load(std::string(edge_file_name), std::string("e>")); g.load(std::string(edge_file_name), std::string("e>"));
...@@ -93,16 +93,53 @@ TEST(TEST_FLEET, graph_sample) { ...@@ -93,16 +93,53 @@ TEST(TEST_FLEET, graph_sample) {
cudaMalloc((void **)&key, 3 * sizeof(int64_t)); cudaMalloc((void **)&key, 3 * sizeof(int64_t));
cudaMemcpy(key, cpu_key, 3 * sizeof(int64_t), cudaMemcpyHostToDevice); cudaMemcpy(key, cpu_key, 3 * sizeof(int64_t), cudaMemcpyHostToDevice);
auto neighbor_sample_res = g.graph_neighbor_sample(0, (int64_t *)key, 3, 3); auto neighbor_sample_res = g.graph_neighbor_sample(0, (int64_t *)key, 3, 3);
int64_t *res = new int64_t[9]; int64_t *res = new int64_t[7];
cudaMemcpy(res, neighbor_sample_res->val, 72, cudaMemcpyDeviceToHost); /*
cudaMemcpy(res, neighbor_sample_res->val, 56, cudaMemcpyDeviceToHost);
std::sort(res, res + 3); std::sort(res, res + 3);
std::sort(res + 6, res + 9); std::sort(res + 4, res + 7);
int64_t expected_sample_val[] = {28, 29, 30, 0, -1, -1, 21, 22, 23}; //int64_t expected_sample_val[] = {28, 29, 30, 0, -1, -1, 21, 22, 23};
for (int i = 0; i < 9; i++) { int64_t expected_sample_val[] = {28, 29, 30, 0, 21, 22, 23};
for (int i = 0; i < 7; i++) {
VLOG(0)<<i<<" "<<res[i];
if (expected_sample_val[i] != -1) { if (expected_sample_val[i] != -1) {
ASSERT_EQ(res[i], expected_sample_val[i]); ASSERT_EQ(res[i], expected_sample_val[i]);
} }
} }
delete[] res; delete[] res;
delete neighbor_sample_res; delete neighbor_sample_res;
*/
cudaMemcpy(res, neighbor_sample_res->val, 56, cudaMemcpyDeviceToHost);
int *actual_sample_size = new int[3];
cudaMemcpy(actual_sample_size, neighbor_sample_res->actual_sample_size, 12,
cudaMemcpyDeviceToHost); // 3, 1, 3
int *cumsum_sample_size = new int[3];
cudaMemcpy(cumsum_sample_size, neighbor_sample_res->offset, 12,
cudaMemcpyDeviceToHost); // 0, 3, 4
std::vector<std::vector<int64_t>> neighbors_;
std::vector<int64_t> neighbors_7 = {28, 29, 30, 31, 32, 33, 34, 35};
std::vector<int64_t> neighbors_0 = {0};
std::vector<int64_t> neighbors_6 = {21, 22, 23, 24, 25, 26, 27};
neighbors_.push_back(neighbors_7);
neighbors_.push_back(neighbors_0);
neighbors_.push_back(neighbors_6);
for (int i = 0; i < 3; i++) {
for (int j = cumsum_sample_size[i];
j < cumsum_sample_size[i] + actual_sample_size[i]; j++) {
bool flag = false;
for (int k = 0; k < neighbors_[i].size(); k++) {
if (res[j] == neighbors_[i][k]) {
flag = true;
break;
}
}
ASSERT_EQ(flag, true);
}
}
delete[] res;
delete[] actual_sample_size;
delete[] cumsum_sample_size;
delete neighbor_sample_res;
} }
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <unistd.h>
#include <condition_variable> // NOLINT
#include <fstream>
#include <iomanip>
#include <string>
#include <thread> // NOLINT
#include <unordered_set>
#include <vector>
#include "google/protobuf/text_format.h"
#include <chrono>
#include "gtest/gtest.h"
#include "paddle/fluid/distributed/ps.pb.h"
#include "paddle/fluid/distributed/ps/service/env.h"
#include "paddle/fluid/distributed/ps/service/sendrecv.pb.h"
#include "paddle/fluid/distributed/ps/table/common_graph_table.h"
#include "paddle/fluid/distributed/ps/table/graph/graph_node.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h"
#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h"
#include "paddle/fluid/framework/fleet/heter_ps/heter_comm.h"
#include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h"
#include "paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
using namespace paddle::framework;
namespace platform = paddle::platform;
namespace operators = paddle::operators;
namespace memory = paddle::memory;
namespace distributed = paddle::distributed;
std::string input_file;
int fixed_key_size = 100, sample_size = 100,
bfs_sample_nodes_in_each_shard = 10000, init_search_size = 1,
bfs_sample_edges = 20;
std::vector<std::string> edges = {
std::string("37\t45\t0.34"), std::string("37\t145\t0.31"),
std::string("37\t112\t0.21"), std::string("96\t48\t1.4"),
std::string("96\t247\t0.31"), std::string("96\t111\t1.21"),
std::string("59\t45\t0.34"), std::string("59\t145\t0.31"),
std::string("59\t122\t0.21"), std::string("97\t48\t0.34"),
std::string("97\t247\t0.31"), std::string("97\t111\t0.21")};
// odd id:96 48 122 112
char edge_file_name[] = "test_edges.txt";
void prepare_file(char file_name[], std::vector<std::string> data) {
std::ofstream ofile;
ofile.open(file_name);
for (auto x : data) {
ofile << x << std::endl;
}
ofile.close();
}
void testSampleRate() {
#ifdef PADDLE_WITH_HETERPS
std::vector<int64_t> ids;
int start = 0;
pthread_rwlock_t rwlock;
pthread_rwlock_init(&rwlock, NULL);
{
::paddle::distributed::GraphParameter table_proto;
table_proto.set_gpups_mode(false);
table_proto.set_shard_num(127);
table_proto.set_task_pool_size(24);
std::cerr << "initializing begin";
distributed::GraphTable graph_table;
graph_table.initialize(table_proto);
std::cerr << "initializing done";
graph_table.load(input_file, std::string("e>"));
int sample_actual_size = -1;
int step = fixed_key_size, cur = 0;
while (sample_actual_size != 0) {
std::unique_ptr<char[]> buffer;
graph_table.pull_graph_list(cur, step, buffer, sample_actual_size, false,
1);
int index = 0;
while (index < sample_actual_size) {
paddle::distributed::FeatureNode node;
node.recover_from_buffer(buffer.get() + index);
index += node.get_size(false);
// res.push_back(node);
ids.push_back(node.get_id());
int swap_pos = rand() % ids.size();
std::swap(ids[swap_pos], ids[(int)ids.size() - 1]);
}
cur = ids.size();
// if (sample_actual_size == 0) break;
// char *buff = buffer.get();
// for (int i = 0; i < sample_actual_size/sizeof(int64_t); i++) {
// ids.push_back(*((int64_t *)buff + i));
// int swap_pos = rand() % ids.size();
// std::swap(ids[swap_pos], ids[(int)ids.size() - 1]);
// }
// cur += sample_actual_size/sizeof(int64_t);
}
std::cerr << "load ids done" << std::endl;
std::vector<int64_t> sample_id[10], sample_neighbors[10];
std::vector<int> actual_size[10];
auto func = [&rwlock, &graph_table, &ids, &sample_id, &actual_size,
&sample_neighbors, &start](int i) {
while (true) {
int s, sn;
bool exit = false;
pthread_rwlock_wrlock(&rwlock);
if (start < ids.size()) {
s = start;
sn = ids.size() - start;
sn = std::min(sn, fixed_key_size);
start += sn;
} else {
exit = true;
}
pthread_rwlock_unlock(&rwlock);
if (exit) break;
std::vector<std::shared_ptr<char>> buffers(sn);
std::vector<int> ac(sn);
auto status = graph_table.random_sample_neighbors(
ids.data() + s, sample_size, buffers, ac, false);
for (int j = s; j < s + sn; j++) {
sample_id[i].push_back(ids[j]);
actual_size[i].push_back(ac[j - s] / sizeof(int64_t));
int ss = ac[j - s] / sizeof(int64_t);
for (int k = 0; k < ss; k++) {
sample_neighbors[i].push_back(
*((int64_t *)(buffers[j - s].get() + k * sizeof(int64_t))));
}
}
}
VLOG(0) << "func " << i << " returns ";
};
auto start1 = std::chrono::steady_clock::now();
std::thread thr[10];
for (int i = 0; i < 10; i++) {
thr[i] = std::thread(func, i);
}
for (int i = 0; i < 10; i++) thr[i].join();
auto end1 = std::chrono::steady_clock::now();
auto tt =
std::chrono::duration_cast<std::chrono::microseconds>(end1 - start1);
std::cerr << "total time cost without cache is " << tt.count() << " us"
<< std::endl;
}
const int gpu_num = 8;
::paddle::distributed::GraphParameter table_proto;
table_proto.set_gpups_mode(true);
table_proto.set_shard_num(127);
table_proto.set_gpu_num(gpu_num);
table_proto.set_gpups_graph_sample_class("BasicBfsGraphSampler");
table_proto.set_gpups_graph_sample_args(std::to_string(init_search_size) +
",100000000,10000000,1,1");
std::vector<int> dev_ids;
for (int i = 0; i < gpu_num; i++) {
dev_ids.push_back(i);
}
std::shared_ptr<HeterPsResource> resource =
std::make_shared<HeterPsResource>(dev_ids);
resource->enable_p2p();
GpuPsGraphTable g(resource);
g.init_cpu_table(table_proto);
g.load(std::string(input_file), std::string("e>"));
NodeQueryResult *query_node_res;
query_node_res = g.query_node_list(0, 0, ids.size() + 10000);
VLOG(0) << "gpu got " << query_node_res->actual_sample_size << " nodes ";
VLOG(0) << "cpu got " << ids.size() << " nodes";
ASSERT_EQ((int)query_node_res->actual_sample_size, (int)ids.size());
int64_t *gpu_node_res = new int64_t[ids.size()];
cudaMemcpy(gpu_node_res, query_node_res->val, ids.size() * sizeof(int64_t),
cudaMemcpyDeviceToHost);
std::unordered_set<int64_t> cpu_node_set, gpu_node_set;
for (auto x : ids) {
cpu_node_set.insert(x);
}
for (int i = 0; i < (int)query_node_res->actual_sample_size; i++) {
auto x = gpu_node_res[i];
ASSERT_EQ(cpu_node_set.find(x) != cpu_node_set.end(), true);
gpu_node_set.insert(x);
}
VLOG(0) << " cpu_node_size = " << cpu_node_set.size();
VLOG(0) << " gpu_node_size = " << gpu_node_set.size();
ASSERT_EQ(cpu_node_set.size(), gpu_node_set.size());
for (int i = 0; i < 20; i++) {
int st = ids.size() / 20 * i;
auto q = g.query_node_list(0, st, ids.size() / 20);
VLOG(0) << " the " << i << "th iteration size = " << q->actual_sample_size;
}
// NodeQueryResult *query_node_list(int gpu_id, int start, int query_size);
/*
void *key;
cudaMalloc((void **)&key, ids.size() * sizeof(int64_t));
cudaMemcpy(key, ids.data(), ids.size() * sizeof(int64_t),
cudaMemcpyHostToDevice);
std::vector<NeighborSampleResult *> res[gpu_num];
start = 0;
auto func = [&rwlock, &g, &res, &start,
&gpu_num, &ids, &key](int i) {
while (true) {
int s, sn;
bool exit = false;
pthread_rwlock_wrlock(&rwlock);
if (start < ids.size()) {
s = start;
sn = ids.size() - start;
sn = min(sn, fixed_key_size);
start += sn;
} else {
exit = true;
}
pthread_rwlock_unlock(&rwlock);
if (exit) break;
auto r =
g.graph_neighbor_sample(i, (int64_t *)(key + s), sample_size, sn);
res[i].push_back(r);
}
};
auto start1 = std::chrono::steady_clock::now();
std::thread thr[gpu_num];
for (int i = 0; i < gpu_num; i++) {
thr[i] = std::thread(func, i);
}
for (int i = 0; i < gpu_num; i++) thr[i].join();
auto end1 = std::chrono::steady_clock::now();
auto tt =
std::chrono::duration_cast<std::chrono::microseconds>(end1 - start1);
std::cerr << "total time cost without cache is " << tt.count() << " us"
<< std::endl;
*/
#endif
}
// TEST(testSampleRate, Run) { testSampleRate(); }
int main(int argc, char *argv[]) {
for (int i = 0; i < argc; i++)
VLOG(0) << "Argument " << i << " is " << std::string(argv[i]);
if (argc > 1) {
input_file = argv[1];
} else {
prepare_file(edge_file_name, edges);
input_file = edge_file_name;
}
VLOG(0) << "input_file is " << input_file;
if (argc > 2) {
fixed_key_size = std::stoi(argv[2]);
}
VLOG(0) << "sample_node_size for every batch is " << fixed_key_size;
if (argc > 3) {
sample_size = std::stoi(argv[3]);
}
VLOG(0) << "sample_size neighbor_size is " << sample_size;
if (argc > 4) init_search_size = std::stoi(argv[4]);
VLOG(0) << " init_search_size " << init_search_size;
testSampleRate();
}
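The threaded loop in testSampleRate reduces to a shared cursor advanced under a lock, with each worker claiming a fixed-size slice of the id list until the cursor passes the end. A distilled sketch of the pattern, substituting std::mutex for the pthread rwlock (sizes are illustrative):

#include <algorithm>
#include <mutex>
#include <thread>
#include <vector>

int main() {
  std::vector<long long> ids(1000);
  int start = 0;
  std::mutex m;
  auto worker = [&]() {
    while (true) {
      int s = 0, sn = 0;
      {
        std::lock_guard<std::mutex> guard(m);
        if (start >= static_cast<int>(ids.size())) break;
        s = start;
        sn = std::min(static_cast<int>(ids.size()) - start, 100);  // batch size
        start += sn;
      }
      (void)s;
      (void)sn;  // sample neighbors for ids[s, s + sn) here
    }
  };
  std::thread t1(worker), t2(worker);
  t1.join();
  t2.join();
  return 0;
}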
...@@ -24,7 +24,7 @@ ...@@ -24,7 +24,7 @@
#include "paddle/fluid/framework/parallel_executor.h" #include "paddle/fluid/framework/parallel_executor.h"
#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/program_desc.h"
USE_OP(mul); USE_OP_ITSELF(mul);
USE_OP(cinn_launch); USE_OP(cinn_launch);
USE_OP_ITSELF(elementwise_add); USE_OP_ITSELF(elementwise_add);
namespace paddle::framework { namespace paddle::framework {
......
...@@ -234,10 +234,26 @@ void InterpreterCore::Convert( ...@@ -234,10 +234,26 @@ void InterpreterCore::Convert(
gc_check_input_list.erase(last, gc_check_input_list.end()); gc_check_input_list.erase(last, gc_check_input_list.end());
for (auto var_id : gc_check_input_list) { for (auto var_id : gc_check_input_list) {
paddle::framework::Variable* var = global_scope_->Var(var_id);
if (var->IsType<LoDTensor>() || var->IsType<phi::SelectedRows>() ||
var->IsType<LoDTensorArray>()) {
vec_meta_info[var_id].var_ref_count_++; vec_meta_info[var_id].var_ref_count_++;
// TODO(zhiqiu): not all vars need to be checked; a var needs to be checked
// only after its last_live_op. For example,
// b = op1(a)
// c = op2(a, b)
// in this case, a is an input of both op1 and op2; we only need to check
// a after op2, because op2's use of a always comes after op1's.
instr.AddGCCheckVar(var_id); instr.AddGCCheckVar(var_id);
VLOG(4) << "clear " << global_scope_->GetNameById(var_id) << " after " VLOG(4) << "clear " << global_scope_->GetNameById(var_id) << " after "
<< instr.OpBase()->Type(); << instr.OpBase()->Type();
} else {
VLOG(4) << "not clear " << global_scope_->GetNameById(var_id)
<< " after " << instr.OpBase()->Type()
<< " because its type is "
<< framework::ToTypeName(var->Type());
}
} }
} }
......
...@@ -674,7 +674,7 @@ TEST(BuildCinnPassTest, NoNeedBufferInput) { ...@@ -674,7 +674,7 @@ TEST(BuildCinnPassTest, NoNeedBufferInput) {
} // namespace paddle } // namespace paddle
USE_PASS(build_cinn_pass); USE_PASS(build_cinn_pass);
USE_OP(mul); USE_OP_ITSELF(mul);
USE_OP_ITSELF(relu); USE_OP_ITSELF(relu);
USE_OP_ITSELF(elementwise_add); USE_OP_ITSELF(elementwise_add);
USE_OP_ITSELF(relu_grad); USE_OP_ITSELF(relu_grad);
......
...@@ -300,6 +300,6 @@ TEST(CinnCompilerTest, Compile) { ...@@ -300,6 +300,6 @@ TEST(CinnCompilerTest, Compile) {
USE_PASS(build_cinn_pass); USE_PASS(build_cinn_pass);
USE_PASS(graph_viz_pass); USE_PASS(graph_viz_pass);
USE_OP(mul); USE_OP_ITSELF(mul);
USE_OP_ITSELF(relu); USE_OP_ITSELF(relu);
USE_OP_ITSELF(elementwise_add); USE_OP_ITSELF(elementwise_add);
...@@ -98,4 +98,4 @@ TEST(test_var_helper, eager_var_helper) { ...@@ -98,4 +98,4 @@ TEST(test_var_helper, eager_var_helper) {
} // namespace imperative } // namespace imperative
} // namespace paddle } // namespace paddle
USE_OP(mul); USE_OP_ITSELF(mul);
...@@ -28,6 +28,8 @@ ...@@ -28,6 +28,8 @@
PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(matmul_with_flatten, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(matmul_with_flatten_grad, CPU, ALL_LAYOUT);
namespace platform = paddle::platform; namespace platform = paddle::platform;
namespace framework = paddle::framework; namespace framework = paddle::framework;
...@@ -267,7 +269,7 @@ TEST(TestHooks, TestGradVarLeafBackwardHookWithSortedGradAccmulated) { ...@@ -267,7 +269,7 @@ TEST(TestHooks, TestGradVarLeafBackwardHookWithSortedGradAccmulated) {
} // namespace imperative } // namespace imperative
} // namespace paddle } // namespace paddle
USE_OP(mul); USE_OP_ITSELF(mul);
USE_OP(mul_grad); USE_OP_ITSELF(mul_grad);
USE_OP_ITSELF(elementwise_add); USE_OP_ITSELF(elementwise_add);
USE_OP_ITSELF(elementwise_add_grad); USE_OP_ITSELF(elementwise_add_grad);
...@@ -416,4 +416,4 @@ TEST(test_layer, test_eager) { ...@@ -416,4 +416,4 @@ TEST(test_layer, test_eager) {
} // namespace imperative } // namespace imperative
} // namespace paddle } // namespace paddle
USE_OP(mul); USE_OP_ITSELF(mul);
...@@ -34,9 +34,13 @@ PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); ...@@ -34,9 +34,13 @@ PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sum, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(sum, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sum_grad, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(sum_grad, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(matmul_with_flatten, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(matmul_with_flatten_grad, CPU, ALL_LAYOUT);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sum_grad, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(sum_grad, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(matmul_with_flatten, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(matmul_with_flatten_grad, GPU, ALL_LAYOUT);
#endif #endif
namespace imperative = paddle::imperative; namespace imperative = paddle::imperative;
...@@ -598,8 +602,8 @@ TEST(test_tracer, eager_tracer) { ...@@ -598,8 +602,8 @@ TEST(test_tracer, eager_tracer) {
} // namespace imperative } // namespace imperative
} // namespace paddle } // namespace paddle
USE_OP(mul); USE_OP_ITSELF(mul);
USE_OP(mul_grad); USE_OP_ITSELF(mul_grad);
USE_OP_ITSELF(reduce_sum); USE_OP_ITSELF(reduce_sum);
USE_OP_ITSELF(reduce_sum_grad); USE_OP_ITSELF(reduce_sum_grad);
USE_OP_ITSELF(elementwise_add); USE_OP_ITSELF(elementwise_add);
...@@ -43,4 +43,4 @@ TEST(fc_op, test) { ...@@ -43,4 +43,4 @@ TEST(fc_op, test) {
} // namespace tensorrt } // namespace tensorrt
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
USE_OP(mul); USE_OP_ITSELF(mul);
...@@ -46,4 +46,4 @@ TEST(MulOpConverter, main) { ...@@ -46,4 +46,4 @@ TEST(MulOpConverter, main) {
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
USE_OP(mul); USE_OP_ITSELF(mul);
...@@ -65,9 +65,10 @@ class MeanCUDAKernel : public framework::OpKernel<T> { ...@@ -65,9 +65,10 @@ class MeanCUDAKernel : public framework::OpKernel<T> {
for (decltype(rank) i = 0; i < rank; ++i) { for (decltype(rank) i = 0; i < rank; ++i) {
reduce_dims.push_back(i); reduce_dims.push_back(i);
} }
TensorReduceImpl<T, T, kernel_primitives::AddFunctor, Div>( TensorReduceImpl<T, T, kernel_primitives::AddFunctor,
context.cuda_device_context(), *input, output, Div(numel), reduce_dims, kps::IdentityFunctor<T>>(
stream); context.cuda_device_context(), *input, output,
kps::IdentityFunctor<T>(), reduce_dims, stream, true);
} }
}; };
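The Div transform is dropped because the mean is now folded into the reduction itself: with is_mean set, the kernel sums with an identity transform and divides once by the reduced element count. A scalar sketch of that contract (not the Paddle kernel):

#include <cassert>
#include <numeric>
#include <vector>

// Reduce with an identity transform; divide once at the end when is_mean.
double reduce_sum_or_mean(const std::vector<double>& x, bool is_mean) {
  double acc = std::accumulate(x.begin(), x.end(), 0.0);
  return is_mean && !x.empty() ? acc / static_cast<double>(x.size()) : acc;
}

int main() {
  std::vector<double> v = {1.0, 2.0, 3.0, 6.0};
  assert(reduce_sum_or_mean(v, false) == 12.0);  // plain sum
  assert(reduce_sum_or_mean(v, true) == 3.0);    // mean = sum / count
  return 0;
}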
......
...@@ -14,7 +14,7 @@ limitations under the License. */ ...@@ -14,7 +14,7 @@ limitations under the License. */
#include <string> #include <string>
#include "paddle/fluid/operators/mul_op.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/mkldnn_reuse.h" #include "paddle/fluid/platform/mkldnn_reuse.h"
namespace phi { namespace phi {
...@@ -46,6 +46,9 @@ using dnnl::memory; ...@@ -46,6 +46,9 @@ using dnnl::memory;
using dnnl::prop_kind; using dnnl::prop_kind;
using dnnl::stream; using dnnl::stream;
constexpr int kMULMKLDNNINT8 = 1;
constexpr int kMULMKLDNNFP32 = 2;
template <typename XT, typename YT, typename OT> template <typename XT, typename YT, typename OT>
class MulPrimitiveFactory { class MulPrimitiveFactory {
public: public:
......
...@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/mul_op.h"
#include <memory> #include <memory>
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <vector> #include <vector>
#include "paddle/fluid/framework/op_registry.h"
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/platform/mkldnn_helper.h"
#endif #endif
...@@ -27,6 +27,9 @@ namespace operators { ...@@ -27,6 +27,9 @@ namespace operators {
using framework::OpKernelType; using framework::OpKernelType;
using framework::Tensor; using framework::Tensor;
constexpr int kMULMKLDNNINT8 = 1;
constexpr int kMULMKLDNNFP32 = 2;
class MulOp : public framework::OperatorWithKernel { class MulOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
...@@ -354,16 +357,3 @@ REGISTER_OPERATOR(mul_grad, ops::MulGradOp, ...@@ -354,16 +357,3 @@ REGISTER_OPERATOR(mul_grad, ops::MulGradOp,
ops::MulDoubleGradMaker<paddle::imperative::OpBase>); ops::MulDoubleGradMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(mul_grad_grad, ops::MulDoubleGradOp); REGISTER_OPERATOR(mul_grad_grad, ops::MulDoubleGradOp);
REGISTER_OP_CPU_KERNEL(
mul, ops::MulKernel<paddle::platform::CPUDeviceContext, float>,
ops::MulKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
mul_grad, ops::MulGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::MulGradKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
mul_grad_grad,
ops::MulDoubleGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::MulDoubleGradKernel<paddle::platform::CPUDeviceContext, double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/mul_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(mul, ops::MulKernel<plat::CUDADeviceContext, float>,
ops::MulKernel<plat::CUDADeviceContext, double>,
ops::MulKernel<plat::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL(
mul_grad, ops::MulGradKernel<plat::CUDADeviceContext, float>,
ops::MulGradKernel<plat::CUDADeviceContext, double>,
ops::MulGradKernel<plat::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL(
mul_grad_grad,
ops::MulDoubleGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::MulDoubleGradKernel<paddle::platform::CUDADeviceContext, double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
constexpr int kMULMKLDNNINT8 = 1;
constexpr int kMULMKLDNNFP32 = 2;
template <typename DeviceContext, typename T>
class MulKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const Tensor* x = context.Input<Tensor>("X");
const Tensor* y = context.Input<Tensor>("Y");
Tensor* z = context.Output<Tensor>("Out");
const Tensor x_matrix =
x->dims().size() > 2
? framework::ReshapeToMatrix(
*x, context.template Attr<int>("x_num_col_dims"))
: *x;
const Tensor y_matrix =
y->dims().size() > 2
? framework::ReshapeToMatrix(
*y, context.template Attr<int>("y_num_col_dims"))
: *y;
z->mutable_data<T>(context.GetPlace());
auto z_dim = z->dims();
if (z_dim.size() != 2) {
z->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
}
auto blas = phi::funcs::GetBlas<DeviceContext, T>(context);
blas.MatMul(x_matrix, y_matrix, z);
if (z_dim.size() != 2) {
z->Resize(z_dim);
}
}
};
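MulKernel relies on the flatten rule behind ReshapeToMatrix / phi::flatten_to_2d: with x_num_col_dims = k, dims [d0, ..., dn-1] collapse to the 2-D shape [d0*...*d(k-1), dk*...*d(n-1)]. A local stand-in makes the rule concrete:

#include <cassert>
#include <utility>
#include <vector>

// Local stand-in for the flatten rule: dims[0..k) form rows, dims[k..n) cols.
static std::pair<long, long> flatten_to_2d(const std::vector<long>& dims, int k) {
  long rows = 1, cols = 1;
  for (int i = 0; i < static_cast<int>(dims.size()); ++i)
    (i < k ? rows : cols) *= dims[i];
  return {rows, cols};
}

int main() {
  auto m = flatten_to_2d({2, 3, 4}, 1);    // x_num_col_dims = 1
  assert(m.first == 2 && m.second == 12);  // [2, 3, 4] -> [2, 12]
  auto n = flatten_to_2d({2, 3, 4}, 2);    // x_num_col_dims = 2
  assert(n.first == 6 && n.second == 4);   // [2, 3, 4] -> [6, 4]
  return 0;
}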
template <typename DeviceContext, typename T>
class MulGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
int x_num_col_dims = ctx.template Attr<int>("x_num_col_dims");
int y_num_col_dims = ctx.template Attr<int>("y_num_col_dims");
auto* x = ctx.Input<framework::LoDTensor>("X");
auto* y = ctx.Input<framework::LoDTensor>("Y");
auto x_matrix = x->dims().size() > 2
? framework::ReshapeToMatrix(*x, x_num_col_dims)
: static_cast<const Tensor&>(*x);
auto y_matrix = y->dims().size() > 2
? framework::ReshapeToMatrix(*y, y_num_col_dims)
: static_cast<const Tensor&>(*y);
auto* dout = ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"));
Tensor dout_mat;
dout_mat.ShareDataWith(*dout);
dout_mat.Resize({phi::flatten_to_2d(x->dims(), x_num_col_dims)[0],
phi::flatten_to_2d(y->dims(), y_num_col_dims)[1]});
auto* dx = ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<framework::LoDTensor>(framework::GradVarName("Y"));
if (dx != nullptr) {
dx->set_lod(x->lod());
}
if (dy != nullptr) {
dy->set_lod(y->lod());
}
auto& dev_ctx = ctx.template device_context<DeviceContext>();
auto blas = phi::funcs::GetBlas<DeviceContext, T>(dev_ctx);
if (dx) {
dx->mutable_data<T>(ctx.GetPlace());
Tensor dx_matrix = dx->dims().size() > 2
? framework::ReshapeToMatrix(*dx, x_num_col_dims)
: *dx;
// dx = dout * y'. dx: M x K, dout : M x N, y : K x N
blas.MatMul(dout_mat, false, y_matrix, true, &dx_matrix);
}
if (dy) {
dy->mutable_data<T>(ctx.GetPlace());
Tensor dy_matrix = dy->dims().size() > 2
? framework::ReshapeToMatrix(*dy, y_num_col_dims)
: *dy;
// dy = x' * dout. dy K x N, dout : M x N, x : M x K
blas.MatMul(x_matrix, true, dout_mat, false, &dy_matrix);
}
}
};
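The shape comments in MulGradKernel follow from standard matmul calculus: for out = x * y with x: M x K, y: K x N, and dout: M x N, dx = dout * y^T has shape M x K and dy = x^T * dout has shape K x N. A small, self-contained check of that dimension algebra (a local helper, not the BLAS call):

#include <cassert>
#include <vector>

using Mat = std::vector<std::vector<double>>;

// op(a) * op(b), where op transposes when the flag is set.
static Mat matmul(const Mat& a, const Mat& b, bool ta, bool tb) {
  size_t M = ta ? a[0].size() : a.size();
  size_t K = ta ? a.size() : a[0].size();
  size_t N = tb ? b.size() : b[0].size();
  Mat c(M, std::vector<double>(N, 0.0));
  for (size_t i = 0; i < M; ++i)
    for (size_t j = 0; j < N; ++j)
      for (size_t k = 0; k < K; ++k)
        c[i][j] += (ta ? a[k][i] : a[i][k]) * (tb ? b[j][k] : b[k][j]);
  return c;
}

int main() {
  Mat x(2, std::vector<double>(3, 1.0));     // x:    M x K = 2 x 3
  Mat y(3, std::vector<double>(4, 1.0));     // y:    K x N = 3 x 4
  Mat dout(2, std::vector<double>(4, 1.0));  // dout: M x N = 2 x 4
  Mat dx = matmul(dout, y, false, true);     // dx = dout * y^T -> 2 x 3
  Mat dy = matmul(x, dout, true, false);     // dy = x^T * dout -> 3 x 4
  assert(dx.size() == 2 && dx[0].size() == 3);
  assert(dy.size() == 3 && dy[0].size() == 4);
  return 0;
}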
template <typename DeviceContext, typename T>
class MulDoubleGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
int x_num_col_dims = ctx.template Attr<int>("x_num_col_dims");
int y_num_col_dims = ctx.template Attr<int>("y_num_col_dims");
auto* x = ctx.Input<framework::LoDTensor>("X");
auto* y = ctx.Input<framework::LoDTensor>("Y");
auto x_mat = x->dims().size() > 2
? framework::ReshapeToMatrix(*x, x_num_col_dims)
: static_cast<const Tensor&>(*x);
auto y_mat = y->dims().size() > 2
? framework::ReshapeToMatrix(*y, y_num_col_dims)
: static_cast<const Tensor&>(*y);
const int m = phi::flatten_to_2d(x->dims(), x_num_col_dims)[0];
const int n = phi::flatten_to_2d(y->dims(), y_num_col_dims)[1];
auto* dout = ctx.Input<framework::LoDTensor>("DOut");
Tensor dout_mat;
dout_mat.ShareDataWith(*dout);
dout_mat.Resize({m, n});
auto* ddx = ctx.Input<framework::LoDTensor>("DDX");
auto* ddy = ctx.Input<framework::LoDTensor>("DDY");
auto* dx = ctx.Output<framework::LoDTensor>("DX");
auto* dy = ctx.Output<framework::LoDTensor>("DY");
auto* ddout = ctx.Output<framework::LoDTensor>("DDOut");
Tensor ddout_mat;
if (ddout) {
ddout->set_lod(dout->lod());
// allocate and reshape ddout
ddout->mutable_data<T>(ctx.GetPlace());
ddout_mat.ShareDataWith(*ddout);
ddout_mat.Resize({m, n});
}
auto& dev_ctx = ctx.template device_context<DeviceContext>();
auto blas = phi::funcs::GetBlas<DeviceContext, T>(dev_ctx);
// a flag specifying whether ddout has been set: if it is false, MatMul's
// beta should be 0 so that ddout is overwritten; if it is true, beta
// should be 1 so that the result is accumulated into ddout.
bool ddout_flag = false;
if (ddx) {
auto ddx_mat = ddx->dims().size() > 2
? framework::ReshapeToMatrix(*ddx, x_num_col_dims)
: static_cast<const Tensor&>(*ddx);
// dy = ddx' * dout. dy : K x M, ddx' : K x M, dout : M x N
if (dy) {
dy->set_lod(y->lod());
// allocate and reshape dy
dy->mutable_data<T>(ctx.GetPlace());
Tensor dy_mat = dy->dims().size() > 2
? framework::ReshapeToMatrix(*dy, y_num_col_dims)
: *dy;
blas.MatMul(ddx_mat, true, dout_mat, false, &dy_mat);
}
// ddout1 = ddx * y. ddx : M x K, y : K x N, ddout1 : M x N
if (ddout) {
blas.MatMul(ddx_mat, false, y_mat, false, static_cast<T>(1.0),
&ddout_mat, static_cast<T>(ddout_flag));
ddout_flag = true;
}
}
if (ddy) {
auto ddy_mat = ddy->dims().size() > 2
? framework::ReshapeToMatrix(*ddy, y_num_col_dims)
: static_cast<const Tensor&>(*ddy);
// dx = dout * ddy'. dout : M x N, ddy' : N x K, dx : M x K
if (dx) {
dx->set_lod(x->lod());
// allocate and reshape dx
dx->mutable_data<T>(ctx.GetPlace());
Tensor dx_mat = dx->dims().size() > 2
? framework::ReshapeToMatrix(*dx, x_num_col_dims)
: *dx;
blas.MatMul(dout_mat, false, ddy_mat, true, &dx_mat);
}
// ddout2 = x * ddy. x : M x K, ddy : K x N, ddout2 : M x N
if (ddout) {
blas.MatMul(x_mat, false, ddy_mat, false, static_cast<T>(1.0),
&ddout_mat, static_cast<T>(ddout_flag));
}
}
}
};
} // namespace operators
} // namespace paddle
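The ddout_flag comment describes the usual GEMM accumulation trick: since MatMul computes C = alpha * A * B + beta * C, the first contribution is written with beta = 0 and the second is added with beta = 1. A scalar illustration:

#include <cassert>

// 1x1 "MatMul": returns alpha * a * b + beta * c.
static double gemm1x1(double a, double b, double alpha, double beta, double c) {
  return alpha * a * b + beta * c;
}

int main() {
  double ddout = 0.0;
  bool ddout_flag = false;
  ddout = gemm1x1(2.0, 3.0, 1.0, ddout_flag ? 1.0 : 0.0, ddout);  // ddx * y, beta = 0
  ddout_flag = true;
  ddout = gemm1x1(4.0, 5.0, 1.0, ddout_flag ? 1.0 : 0.0, ddout);  // x * ddy, beta = 1
  assert(ddout == 2.0 * 3.0 + 4.0 * 5.0);  // both contributions accumulated
  return 0;
}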
...@@ -15,7 +15,7 @@ limitations under the License. */ ...@@ -15,7 +15,7 @@ limitations under the License. */
#include <memory> #include <memory>
#include <string> #include <string>
#include "paddle/fluid/operators/mul_op.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle { namespace paddle {
......
...@@ -14,11 +14,11 @@ limitations under the License. */ ...@@ -14,11 +14,11 @@ limitations under the License. */
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
#include "paddle/fluid/operators/mul_op.h"
#include <memory> #include <memory>
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <vector> #include <vector>
#include "paddle/fluid/framework/op_registry.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -14,8 +14,13 @@ limitations under the License. */ ...@@ -14,8 +14,13 @@ limitations under the License. */
#include <memory> #include <memory>
#include <vector> #include <vector>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/multiary.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -25,44 +30,6 @@ class MultiplexOp : public framework::OperatorWithKernel { ...@@ -25,44 +30,6 @@ class MultiplexOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("Ids"), "Input", "Ids", "Multiplex");
PADDLE_ENFORCE_NE(
ctx->Inputs("X").empty(), true,
platform::errors::InvalidArgument("MultiInput(X) shouldn't be empty."));
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Multiplex");
auto ids_dim = ctx->GetInputDim("Ids");
PADDLE_ENFORCE_EQ(
ids_dim.size(), 2,
platform::errors::PreconditionNotMet(
"The index tensor must be a vector with 2 dimensions"));
PADDLE_ENFORCE_EQ(
ids_dim[1], 1,
platform::errors::PreconditionNotMet(
"The index tensor must be a vector with batchSize x 1."));
auto ins_dims = ctx->GetInputsDim("X");
auto num_ins = ins_dims.size();
PADDLE_ENFORCE_GT(num_ins, 1,
platform::errors::InvalidArgument(
"multiplex operator should have more than "
"one candidate input tensors."));
auto in_dim = ins_dims[0];
PADDLE_ENFORCE_GE(
in_dim.size(), 2,
platform::errors::InvalidArgument(
"The rank of candidate tensors must be not less than 2."));
for (size_t i = 1; i < num_ins; i++) {
auto dim = ins_dims[i];
PADDLE_ENFORCE_EQ(
in_dim, dim,
platform::errors::PreconditionNotMet(
"All the candidate tensors must have the same size."));
}
ctx->SetOutputDim("Out", in_dim);
}
protected: protected:
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
...@@ -164,8 +131,11 @@ class MultiplexGradMaker : public framework::SingleGradOpMaker<T> { ...@@ -164,8 +131,11 @@ class MultiplexGradMaker : public framework::SingleGradOpMaker<T> {
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(multiplex, MultiplexInferShapeFunctor,
PD_INFER_META(phi::MultiplexInferMeta));
REGISTER_OPERATOR(multiplex, ops::MultiplexOp, ops::MultiplexOpMaker, REGISTER_OPERATOR(multiplex, ops::MultiplexOp, ops::MultiplexOpMaker,
ops::MultiplexGradMaker<paddle::framework::OpDesc>, ops::MultiplexGradMaker<paddle::framework::OpDesc>,
ops::MultiplexGradMaker<paddle::imperative::OpBase>); ops::MultiplexGradMaker<paddle::imperative::OpBase>,
MultiplexInferShapeFunctor);
REGISTER_OPERATOR(multiplex_grad, ops::MultiplexGradOp); REGISTER_OPERATOR(multiplex_grad, ops::MultiplexGradOp);
...@@ -21,6 +21,10 @@ ...@@ -21,6 +21,10 @@
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/platform/mkldnn_helper.h"
#endif #endif
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/unary.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -29,43 +33,6 @@ using DDim = framework::DDim; ...@@ -29,43 +33,6 @@ using DDim = framework::DDim;
class QrOp : public framework::OperatorWithKernel { class QrOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "qr");
OP_INOUT_CHECK(ctx->HasOutput("Q"), "Output", "Q", "qr");
OP_INOUT_CHECK(ctx->HasOutput("R"), "Output", "R", "qr");
auto x_dims = ctx->GetInputDim("X");
int x_rank = x_dims.size();
PADDLE_ENFORCE_GE(x_dims.size(), 2,
platform::errors::InvalidArgument(
"the rank of input must greater than 2"));
bool compute_q;
bool reduced_mode;
int m = x_dims[x_rank - 2];
int n = x_dims[x_rank - 1];
int min_mn = std::min(m, n);
std::string mode = ctx->Attrs().Get<std::string>("mode");
std::tie(compute_q, reduced_mode) = _parse_qr_mode(mode);
if (compute_q) {
int k = reduced_mode ? min_mn : m;
auto q_dims_vec = phi::vectorize(x_dims);
q_dims_vec[q_dims_vec.size() - 1] = k;
ctx->SetOutputDim("Q", phi::make_ddim(q_dims_vec));
} else {
ctx->SetOutputDim("Q", phi::make_ddim({0}));
}
int k = reduced_mode ? min_mn : m;
auto r_dims_vec = phi::vectorize(x_dims);
r_dims_vec[r_dims_vec.size() - 2] = k;
r_dims_vec[r_dims_vec.size() - 1] = n;
ctx->SetOutputDim("R", phi::make_ddim(r_dims_vec));
ctx->ShareLoD("X", /*->*/ "Q");
ctx->ShareLoD("X", /*->*/ "R");
}
}; };
class QrOpMaker : public framework::OpProtoAndCheckerMaker { class QrOpMaker : public framework::OpProtoAndCheckerMaker {
...@@ -83,10 +50,8 @@ class QrOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -83,10 +50,8 @@ class QrOpMaker : public framework::OpProtoAndCheckerMaker {
.SetDefault("reduced"); .SetDefault("reduced");
AddComment(R"DOC( AddComment(R"DOC(
Qr Operator. Qr Operator.
This operator is used to perform the QR decomposition of batched matrices $X$. This operator is used to perform the QR decomposition of batched matrices $X$.
$$Q, R = qr(X)$$ $$Q, R = qr(X)$$
)DOC"); )DOC");
} }
}; };
...@@ -138,10 +103,13 @@ class QrGradMaker : public framework::SingleGradOpMaker<T> { ...@@ -138,10 +103,13 @@ class QrGradMaker : public framework::SingleGradOpMaker<T> {
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(qr, QrInferShapeFunctor,
PD_INFER_META(phi::QrInferMeta));
REGISTER_OPERATOR(qr, ops::QrOp, ops::QrOpMaker, REGISTER_OPERATOR(qr, ops::QrOp, ops::QrOpMaker,
ops::QrGradMaker<paddle::framework::OpDesc>, ops::QrGradMaker<paddle::framework::OpDesc>,
ops::QrGradMaker<paddle::imperative::OpBase>); ops::QrGradMaker<paddle::imperative::OpBase>,
QrInferShapeFunctor);
REGISTER_OPERATOR(qr_grad, ops::QrGradOp); REGISTER_OPERATOR(qr_grad, ops::QrGradOp);
......
...@@ -33,12 +33,12 @@ void TensorReduceImpl(const platform::CUDADeviceContext& dev_ctx, ...@@ -33,12 +33,12 @@ void TensorReduceImpl(const platform::CUDADeviceContext& dev_ctx,
const framework::Tensor& x, framework::Tensor* y, const framework::Tensor& x, framework::Tensor* y,
const TransformOp& transform, const TransformOp& transform,
const std::vector<int>& origin_reduce_dims, const std::vector<int>& origin_reduce_dims,
gpuStream_t stream) { gpuStream_t stream, bool is_mean = false) {
y->mutable_data<Ty>(x.place()); y->mutable_data<Ty>(x.place());
phi::funcs::ReduceKernel<Tx, Ty, ReduceOp, TransformOp>( phi::funcs::ReduceKernel<Tx, Ty, ReduceOp, TransformOp>(
static_cast<const phi::GPUContext&>(dev_ctx), x, y, transform, static_cast<const phi::GPUContext&>(dev_ctx), x, y, transform,
origin_reduce_dims); origin_reduce_dims, is_mean);
} }
} // namespace operators } // namespace operators
......
...@@ -13,29 +13,18 @@ See the License for the specific language governing permissions and ...@@ -13,29 +13,18 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <memory> #include <memory>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/unary.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
class TrilTriuOp : public framework::OperatorWithKernel { class TrilTriuOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE_EQ(
ctx->HasInput("X"), true,
platform::errors::NotFound("Input(X) of TrilTriuOp is not found."));
PADDLE_ENFORCE_EQ(
ctx->HasOutput("Out"), true,
platform::errors::NotFound("Output(Out) of TrilTriuOp is not found."));
const auto& x_dims = ctx->GetInputDim("X");
PADDLE_ENFORCE_GE(x_dims.size(), 2,
platform::errors::InvalidArgument(
"Input(X)'s rank must be at least 2 in TrilTriuOp."));
ctx->SetOutputDim("Out", x_dims);
ctx->ShareLoD("X", /*->*/ "Out");
}
}; };
class TrilTriuOpMaker : public framework::OpProtoAndCheckerMaker { class TrilTriuOpMaker : public framework::OpProtoAndCheckerMaker {
...@@ -100,7 +89,10 @@ class TrilTriuGradOpMaker : public framework::SingleGradOpMaker<T> { ...@@ -100,7 +89,10 @@ class TrilTriuGradOpMaker : public framework::SingleGradOpMaker<T> {
namespace ops = paddle::operators; namespace ops = paddle::operators;
namespace plat = paddle::platform; namespace plat = paddle::platform;
DECLARE_INFER_SHAPE_FUNCTOR(tril_triu, TrilTriuInferShapeFunctor,
PD_INFER_META(phi::TrilTriuInferMeta));
REGISTER_OPERATOR(tril_triu, ops::TrilTriuOp, ops::TrilTriuOpMaker, REGISTER_OPERATOR(tril_triu, ops::TrilTriuOp, ops::TrilTriuOpMaker,
ops::TrilTriuGradOpMaker<paddle::framework::OpDesc>, ops::TrilTriuGradOpMaker<paddle::framework::OpDesc>,
ops::TrilTriuGradOpMaker<paddle::imperative::OpBase>); ops::TrilTriuGradOpMaker<paddle::imperative::OpBase>,
TrilTriuInferShapeFunctor);
REGISTER_OPERATOR(tril_triu_grad, ops::TrilTriuGradOp); REGISTER_OPERATOR(tril_triu_grad, ops::TrilTriuGradOp);
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#pragma once #pragma once
#include <iostream> #include <iostream>
#include "paddle/phi/core/enforce.h"
static PyObject *eager_api_run_program(PyObject *self, PyObject *args, static PyObject *eager_api_run_program(PyObject *self, PyObject *args,
PyObject *kwargs) { PyObject *kwargs) {
...@@ -33,13 +34,24 @@ static PyObject *eager_api_run_program(PyObject *self, PyObject *args, ...@@ -33,13 +34,24 @@ static PyObject *eager_api_run_program(PyObject *self, PyObject *args,
run_program_dygraph_function(X, Params, Out, OutScope, DOut, attrs); run_program_dygraph_function(X, Params, Out, OutScope, DOut, attrs);
PyEval_RestoreThread(tstate); PyEval_RestoreThread(tstate);
tstate = nullptr; tstate = nullptr;
Py_RETURN_NONE;
} catch (paddle::platform::EnforceNotMet &exception) {
if (tstate) {
PyEval_RestoreThread(tstate);
}
std::ostringstream sout;
sout << exception.what();
sout << " [operator < run_program > error]";
exception.set_error_str(sout.str());
ThrowExceptionToPython(std::current_exception());
return nullptr;
} catch (...) { } catch (...) {
if (tstate) { if (tstate) {
PyEval_RestoreThread(tstate); PyEval_RestoreThread(tstate);
} }
ThrowExceptionToPython(std::current_exception()); ThrowExceptionToPython(std::current_exception());
return nullptr;
} }
Py_RETURN_NONE;
} }
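The new catch clause is a catch-augment-rethrow: intercept the framework's EnforceNotMet, append operator context to the message, and forward the exception to Python. The same shape in plain C++, with std::runtime_error standing in for EnforceNotMet:

#include <iostream>
#include <sstream>
#include <stdexcept>

int main() {
  try {
    try {
      throw std::runtime_error("holder_ should not be null");
    } catch (std::runtime_error &e) {
      std::ostringstream sout;
      sout << e.what() << " [operator < run_program > error]";
      throw std::runtime_error(sout.str());  // rethrow with added context
    }
  } catch (const std::exception &e) {
    std::cout << e.what() << std::endl;  // what the caller would see
  }
  return 0;
}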
static PyMethodDef CustomEagerFinalStateMethods[] = { static PyMethodDef CustomEagerFinalStateMethods[] = {
......
...@@ -40,6 +40,9 @@ limitations under the License. */ ...@@ -40,6 +40,9 @@ limitations under the License. */
#include "paddle/phi/common/data_type.h" #include "paddle/phi/common/data_type.h"
#include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/sparse_coo_tensor.h"
#include "paddle/phi/core/sparse_csr_tensor.h"
namespace paddle { namespace paddle {
namespace pybind { namespace pybind {
...@@ -468,6 +471,90 @@ static PyObject* eager_api_run_costum_op(PyObject* self, PyObject* args, ...@@ -468,6 +471,90 @@ static PyObject* eager_api_run_costum_op(PyObject* self, PyObject* args,
EAGER_CATCH_AND_THROW_RETURN_NULL EAGER_CATCH_AND_THROW_RETURN_NULL
} }
static PyObject* eager_api_sparse_coo_tensor(PyObject* self, PyObject* args,
PyObject* kwargs) {
EAGER_TRY
auto non_zero_indices = CastPyArg2Tensor(PyTuple_GET_ITEM(args, 0), 0);
auto non_zero_elements = CastPyArg2Tensor(PyTuple_GET_ITEM(args, 1), 1);
auto dense_shape = CastPyArg2VectorOfInt(PyTuple_GET_ITEM(args, 2), 2);
auto stop_gradient = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 3), 3);
PADDLE_ENFORCE(non_zero_indices.is_dense_tensor(),
paddle::platform::errors::Fatal(
"the non-zero indices must be a DenseTensor."));
PADDLE_ENFORCE(non_zero_elements.is_dense_tensor(),
paddle::platform::errors::Fatal(
"the non-zero elements must be a DenseTensor."));
auto dense_indices =
std::dynamic_pointer_cast<phi::DenseTensor>(non_zero_indices.impl());
auto dense_elements =
std::dynamic_pointer_cast<phi::DenseTensor>(non_zero_elements.impl());
// TODO(zhangkaihuo): After creating the SparseTensor, call coalesced() to
// sort and merge duplicate indices
std::shared_ptr<phi::SparseCooTensor> coo_tensor =
std::make_shared<phi::SparseCooTensor>(*dense_indices, *dense_elements,
phi::make_ddim(dense_shape));
paddle::experimental::Tensor tensor;
tensor.set_impl(coo_tensor);
auto name =
egr::Controller::Instance().GenerateUniqueName("generated_tensor");
tensor.set_name(name);
auto autograd_meta = egr::EagerUtils::autograd_meta(&tensor);
autograd_meta->SetStopGradient(static_cast<bool>(stop_gradient));
if (!autograd_meta->GetMutableGradNode()) {
VLOG(3) << "Tensor(" << name
<< ") have not GradNode, add GradNodeAccumulation for it.";
autograd_meta->SetGradNode(
std::make_shared<egr::GradNodeAccumulation>(autograd_meta));
}
return ToPyObject(tensor);
EAGER_CATCH_AND_THROW_RETURN_NULL
}
static PyObject* eager_api_sparse_csr_tensor(PyObject* self, PyObject* args,
PyObject* kwargs) {
EAGER_TRY
auto non_zero_crows = CastPyArg2Tensor(PyTuple_GET_ITEM(args, 0), 0);
auto non_zero_cols = CastPyArg2Tensor(PyTuple_GET_ITEM(args, 1), 1);
auto non_zero_elements = CastPyArg2Tensor(PyTuple_GET_ITEM(args, 2), 2);
auto dense_shape = CastPyArg2VectorOfInt(PyTuple_GET_ITEM(args, 3), 3);
auto stop_gradient = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 4), 4);
PADDLE_ENFORCE(non_zero_crows.is_dense_tensor(),
paddle::platform::errors::Fatal(
"the compressed non-zero rows must be a DenseTensor."));
PADDLE_ENFORCE(non_zero_cols.is_dense_tensor(),
paddle::platform::errors::Fatal(
"the non-zero cols must be a DenseTensor."));
PADDLE_ENFORCE(non_zero_elements.is_dense_tensor(),
paddle::platform::errors::Fatal(
"the non-zero elements must be a DenseTensor."));
auto dense_crows =
std::dynamic_pointer_cast<phi::DenseTensor>(non_zero_crows.impl());
auto dense_cols =
std::dynamic_pointer_cast<phi::DenseTensor>(non_zero_cols.impl());
auto dense_elements =
std::dynamic_pointer_cast<phi::DenseTensor>(non_zero_elements.impl());
std::shared_ptr<phi::SparseCsrTensor> csr_tensor =
std::make_shared<phi::SparseCsrTensor>(*dense_crows, *dense_cols,
*dense_elements,
phi::make_ddim(dense_shape));
paddle::experimental::Tensor tensor;
tensor.set_impl(csr_tensor);
auto name =
egr::Controller::Instance().GenerateUniqueName("generated_tensor");
tensor.set_name(name);
auto autograd_meta = egr::EagerUtils::autograd_meta(&tensor);
autograd_meta->SetStopGradient(static_cast<bool>(stop_gradient));
if (!autograd_meta->GetMutableGradNode()) {
VLOG(3) << "Tensor(" << name
<< ") have not GradNode, add GradNodeAccumulation for it.";
autograd_meta->SetGradNode(
std::make_shared<egr::GradNodeAccumulation>(autograd_meta));
}
return ToPyObject(tensor);
EAGER_CATCH_AND_THROW_RETURN_NULL
}
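For reference, the two layouts these bindings construct, shown framework-free for the 2 x 3 matrix [[0, 2, 0], [3, 0, 4]]: COO keeps one (row, col) index per non-zero plus a values array, while CSR compresses the row indices into per-row prefix counts (crows):

#include <cassert>
#include <vector>

int main() {
  // COO: indices form a 2 x nnz matrix (row ids, col ids); values are nnz long.
  std::vector<int> coo_rows = {0, 1, 1};
  std::vector<int> coo_cols = {1, 0, 2};
  std::vector<float> values = {2.f, 3.f, 4.f};

  // CSR for the same matrix: row i's non-zeros span [crows[i], crows[i + 1]).
  std::vector<int> crows = {0, 1, 3};
  std::vector<int> cols = {1, 0, 2};

  assert(static_cast<int>(values.size()) == crows.back());
  assert(coo_rows.size() == values.size() && cols.size() == values.size());
  (void)coo_cols;
  return 0;
}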
PyMethodDef variable_functions[] = { PyMethodDef variable_functions[] = {
// TODO(jiabin): Remove scale when we have final state tests // TODO(jiabin): Remove scale when we have final state tests
{"scale", (PyCFunction)(void (*)(void))eager_api_scale, {"scale", (PyCFunction)(void (*)(void))eager_api_scale,
...@@ -490,6 +577,14 @@ PyMethodDef variable_functions[] = { ...@@ -490,6 +577,14 @@ PyMethodDef variable_functions[] = {
{"read_next_tensor_list", {"read_next_tensor_list",
(PyCFunction)(void (*)(void))eager_api_read_next_tensor_list, (PyCFunction)(void (*)(void))eager_api_read_next_tensor_list,
METH_VARARGS | METH_KEYWORDS, NULL}, METH_VARARGS | METH_KEYWORDS, NULL},
/**sparse functions**/
{"sparse_coo_tensor",
(PyCFunction)(void (*)(void))eager_api_sparse_coo_tensor,
METH_VARARGS | METH_KEYWORDS, NULL},
{"sparse_csr_tensor",
(PyCFunction)(void (*)(void))eager_api_sparse_csr_tensor,
METH_VARARGS | METH_KEYWORDS, NULL},
/**sparse functions**/
{NULL, NULL, 0, NULL}}; {NULL, NULL, 0, NULL}};
void BindFunctions(PyObject* module) { void BindFunctions(PyObject* module) {
......
...@@ -959,11 +959,11 @@ static PyObject* tensor__set_grad_type(TensorObject* self, PyObject* args, ...@@ -959,11 +959,11 @@ static PyObject* tensor__set_grad_type(TensorObject* self, PyObject* args,
EAGER_TRY EAGER_TRY
auto var_type = pybind::CastPyArg2ProtoType(PyTuple_GET_ITEM(args, 0), 0); auto var_type = pybind::CastPyArg2ProtoType(PyTuple_GET_ITEM(args, 0), 0);
auto grad_tensor = auto grad_tensor =
egr::EagerUtils::unsafe_autograd_meta(self->tensor)->Grad(); egr::EagerUtils::unsafe_autograd_meta(self->tensor)->MutableGrad();
if (var_type == framework::proto::VarType::LOD_TENSOR) { if (var_type == framework::proto::VarType::LOD_TENSOR) {
grad_tensor.set_impl(std::make_shared<phi::DenseTensor>()); grad_tensor->set_impl(std::make_shared<phi::DenseTensor>());
} else if (var_type == framework::proto::VarType::SELECTED_ROWS) { } else if (var_type == framework::proto::VarType::SELECTED_ROWS) {
grad_tensor.set_impl(std::make_shared<phi::SelectedRows>()); grad_tensor->set_impl(std::make_shared<phi::SelectedRows>());
} }
return Py_None; return Py_None;
EAGER_CATCH_AND_THROW_RETURN_NULL EAGER_CATCH_AND_THROW_RETURN_NULL
...@@ -1097,6 +1097,49 @@ static PyObject* tensor_method_is_sparse_csr(TensorObject* self, PyObject* args, ...@@ -1097,6 +1097,49 @@ static PyObject* tensor_method_is_sparse_csr(TensorObject* self, PyObject* args,
EAGER_CATCH_AND_THROW_RETURN_NULL EAGER_CATCH_AND_THROW_RETURN_NULL
} }
static PyObject* tensor_method_to_sparse_coo(TensorObject* self, PyObject* args,
PyObject* kwargs) {
EAGER_TRY
int64_t sparse_dim = CastPyArg2AttrLong(PyTuple_GET_ITEM(args, 0), 0);
auto coo_tensor = self->tensor.to_sparse_coo(sparse_dim);
egr::EagerUtils::autograd_meta(&coo_tensor)
->SetStopGradient(
egr::EagerUtils::autograd_meta(&self->tensor)->StopGradient());
egr::EagerUtils::autograd_meta(&coo_tensor)
->SetPersistable(
egr::EagerUtils::autograd_meta(&(self->tensor))->Persistable());
return ToPyObject(coo_tensor);
EAGER_CATCH_AND_THROW_RETURN_NULL
}
static PyObject* tensor_method_to_sparse_csr(TensorObject* self, PyObject* args,
PyObject* kwargs) {
EAGER_TRY
auto csr_tensor = self->tensor.to_sparse_csr();
egr::EagerUtils::autograd_meta(&csr_tensor)
->SetStopGradient(
egr::EagerUtils::autograd_meta(&self->tensor)->StopGradient());
egr::EagerUtils::autograd_meta(&csr_tensor)
->SetPersistable(
egr::EagerUtils::autograd_meta(&(self->tensor))->Persistable());
return ToPyObject(csr_tensor);
EAGER_CATCH_AND_THROW_RETURN_NULL
}
static PyObject* tensor_method_to_dense(TensorObject* self, PyObject* args,
PyObject* kwargs) {
EAGER_TRY
auto dense_tensor = self->tensor.to_dense();
egr::EagerUtils::autograd_meta(&dense_tensor)
->SetStopGradient(
egr::EagerUtils::autograd_meta(&self->tensor)->StopGradient());
egr::EagerUtils::autograd_meta(&dense_tensor)
->SetPersistable(
egr::EagerUtils::autograd_meta(&(self->tensor))->Persistable());
return ToPyObject(dense_tensor);
EAGER_CATCH_AND_THROW_RETURN_NULL
}
static PyObject* tensor__inplace_version(TensorObject* self, PyObject* args, static PyObject* tensor__inplace_version(TensorObject* self, PyObject* args,
PyObject* kwargs) { PyObject* kwargs) {
EAGER_TRY EAGER_TRY
...@@ -1185,6 +1228,12 @@ PyMethodDef variable_methods[] = { ...@@ -1185,6 +1228,12 @@ PyMethodDef variable_methods[] = {
METH_VARARGS | METH_KEYWORDS, NULL}, METH_VARARGS | METH_KEYWORDS, NULL},
{"is_sparse_csr", (PyCFunction)(void (*)(void))tensor_method_is_sparse_csr, {"is_sparse_csr", (PyCFunction)(void (*)(void))tensor_method_is_sparse_csr,
METH_VARARGS | METH_KEYWORDS, NULL}, METH_VARARGS | METH_KEYWORDS, NULL},
{"to_sparse_coo", (PyCFunction)(void (*)(void))tensor_method_to_sparse_coo,
METH_VARARGS | METH_KEYWORDS, NULL},
{"to_sparse_csr", (PyCFunction)(void (*)(void))tensor_method_to_sparse_csr,
METH_VARARGS | METH_KEYWORDS, NULL},
{"to_dense", (PyCFunction)(void (*)(void))tensor_method_to_dense,
METH_VARARGS | METH_KEYWORDS, NULL},
/***the method of sparse tensor****/ /***the method of sparse tensor****/
{"_inplace_version", (PyCFunction)(void (*)(void))tensor__inplace_version, {"_inplace_version", (PyCFunction)(void (*)(void))tensor__inplace_version,
METH_VARARGS | METH_KEYWORDS, NULL}, METH_VARARGS | METH_KEYWORDS, NULL},
......
...@@ -33,19 +33,21 @@ namespace tensorrt { ...@@ -33,19 +33,21 @@ namespace tensorrt {
static nvinfer1::IBuilder* createInferBuilder( static nvinfer1::IBuilder* createInferBuilder(
nvinfer1::ILogger& logger) { // NOLINT nvinfer1::ILogger& logger) { // NOLINT
return static_cast<nvinfer1::IBuilder*>( return static_cast<nvinfer1::IBuilder*>(
phi::dynload::createInferBuilder_INTERNAL(&logger, NV_TENSORRT_VERSION)); ::phi::dynload::createInferBuilder_INTERNAL(&logger,
NV_TENSORRT_VERSION));
} }
static nvinfer1::IRuntime* createInferRuntime( static nvinfer1::IRuntime* createInferRuntime(
nvinfer1::ILogger& logger) { // NOLINT nvinfer1::ILogger& logger) { // NOLINT
return static_cast<nvinfer1::IRuntime*>( return static_cast<nvinfer1::IRuntime*>(
phi::dynload::createInferRuntime_INTERNAL(&logger, NV_TENSORRT_VERSION)); ::phi::dynload::createInferRuntime_INTERNAL(&logger,
NV_TENSORRT_VERSION));
} }
TrtEngine::TrtEngine(int device_id) : device_id_(device_id) { TrtEngine::TrtEngine(int device_id) : device_id_(device_id) {
FreshDeviceId(); FreshDeviceId();
logger_.reset(new TrtLogger()); logger_.reset(new TrtLogger());
builder_.reset(createInferBuilder(logger_->GetTrtLogger())); builder_.reset(createInferBuilder(logger_->GetTrtLogger()));
phi::dynload::initLibNvInferPlugins(&logger_->GetTrtLogger(), ""); ::phi::dynload::initLibNvInferPlugins(&logger_->GetTrtLogger(), "");
} }
nvinfer1::IBuilder* TrtEngine::GetTrtBuilder() { nvinfer1::IBuilder* TrtEngine::GetTrtBuilder() {
...@@ -237,11 +239,11 @@ bool TrtEngine::SetupNetworkAndConfig(const BuildOptions& build, ...@@ -237,11 +239,11 @@ bool TrtEngine::SetupNetworkAndConfig(const BuildOptions& build,
} }
void TrtEngine::PrepareOutputHandle(const std::string& out_name) { void TrtEngine::PrepareOutputHandle(const std::string& out_name) {
phi::DenseTensor t; ::phi::DenseTensor t;
outputs_.emplace(out_name, t); outputs_.emplace(out_name, t);
} }
phi::DenseTensor* TrtEngine::GetOutput(const std::string& name) { ::phi::DenseTensor* TrtEngine::GetOutput(const std::string& name) {
return &outputs_[name]; return &outputs_[name];
} }
...@@ -249,7 +251,7 @@ size_t TrtEngine::GetOutputNum() const { return outputs_.size(); } ...@@ -249,7 +251,7 @@ size_t TrtEngine::GetOutputNum() const { return outputs_.size(); }
bool TrtEngine::SetUpInference( bool TrtEngine::SetUpInference(
const InferenceOptions& inference, const InferenceOptions& inference,
const std::unordered_map<std::string, phi::DenseTensor*>& inputs) { const std::unordered_map<std::string, ::phi::DenseTensor*>& inputs) {
// TODO(wilber): now only create one exec_context // TODO(wilber): now only create one exec_context
FreshDeviceId(); FreshDeviceId();
CHECK(engine_ != nullptr); CHECK(engine_ != nullptr);
...@@ -272,7 +274,7 @@ bool TrtEngine::SetUpInference( ...@@ -272,7 +274,7 @@ bool TrtEngine::SetUpInference(
return true; return true;
} }
void TrtEngine::Run(const phi::GPUContext& ctx) { void TrtEngine::Run(const ::phi::GPUContext& ctx) {
if (is_dynamic_shape_) { if (is_dynamic_shape_) {
DynamicRun(ctx); DynamicRun(ctx);
} else { } else {
...@@ -280,7 +282,7 @@ void TrtEngine::Run(const phi::GPUContext& ctx) { ...@@ -280,7 +282,7 @@ void TrtEngine::Run(const phi::GPUContext& ctx) {
} }
} }
void TrtEngine::StaticRun(const phi::GPUContext& ctx) { void TrtEngine::StaticRun(const ::phi::GPUContext& ctx) {
const int num_bindings = engine_->getNbBindings(); const int num_bindings = engine_->getNbBindings();
std::vector<void*> buffers(num_bindings, nullptr); std::vector<void*> buffers(num_bindings, nullptr);
...@@ -291,7 +293,8 @@ void TrtEngine::StaticRun(const phi::GPUContext& ctx) { ...@@ -291,7 +293,8 @@ void TrtEngine::StaticRun(const phi::GPUContext& ctx) {
buffers[bind_index] = buffers[bind_index] =
const_cast<void*>(static_cast<const void*>(bind.buffer->data<float>())); const_cast<void*>(static_cast<const void*>(bind.buffer->data<float>()));
if (runtime_batch != -1) { if (runtime_batch != -1) {
CHECK_EQ(runtime_batch, phi::vectorize<int64_t>(bind.buffer->dims())[0]); CHECK_EQ(runtime_batch,
::phi::vectorize<int64_t>(bind.buffer->dims())[0]);
} }
runtime_batch = bind.buffer->dims()[0]; runtime_batch = bind.buffer->dims()[0];
} }
...@@ -306,7 +309,7 @@ void TrtEngine::StaticRun(const phi::GPUContext& ctx) { ...@@ -306,7 +309,7 @@ void TrtEngine::StaticRun(const phi::GPUContext& ctx) {
for (int i = 0; i < dims.nbDims; ++i) { for (int i = 0; i < dims.nbDims; ++i) {
ddim.push_back(dims.d[i]); ddim.push_back(dims.d[i]);
} }
bind.buffer->Resize(phi::make_ddim(ddim)); bind.buffer->Resize(::phi::make_ddim(ddim));
// TODO(wilber): now only support float output. // TODO(wilber): now only support float output.
ctx.Alloc<float>(bind.buffer, sizeof(float) * bind.buffer->numel()); ctx.Alloc<float>(bind.buffer, sizeof(float) * bind.buffer->numel());
buffers[bind_index] = static_cast<void*>(bind.buffer->data<float>()); buffers[bind_index] = static_cast<void*>(bind.buffer->data<float>());
...@@ -316,7 +319,7 @@ void TrtEngine::StaticRun(const phi::GPUContext& ctx) { ...@@ -316,7 +319,7 @@ void TrtEngine::StaticRun(const phi::GPUContext& ctx) {
runtime_batch, buffers.data(), ctx.stream(), nullptr); runtime_batch, buffers.data(), ctx.stream(), nullptr);
} }
void TrtEngine::DynamicRun(const phi::GPUContext& ctx) { void TrtEngine::DynamicRun(const ::phi::GPUContext& ctx) {
const int num_bindings = engine_->getNbBindings(); const int num_bindings = engine_->getNbBindings();
std::vector<void*> buffers(num_bindings, nullptr); std::vector<void*> buffers(num_bindings, nullptr);
...@@ -344,7 +347,7 @@ void TrtEngine::DynamicRun(const phi::GPUContext& ctx) { ...@@ -344,7 +347,7 @@ void TrtEngine::DynamicRun(const phi::GPUContext& ctx) {
for (int i = 0; i < dims.nbDims; ++i) { for (int i = 0; i < dims.nbDims; ++i) {
ddim[i] = dims.d[i]; ddim[i] = dims.d[i];
} }
bind.buffer->Resize(phi::make_ddim(ddim)); bind.buffer->Resize(::phi::make_ddim(ddim));
ctx.Alloc<float>(bind.buffer, sizeof(float) * bind.buffer->numel()); ctx.Alloc<float>(bind.buffer, sizeof(float) * bind.buffer->numel());
buffers[bind_index] = static_cast<void*>(bind.buffer->data<float>()); buffers[bind_index] = static_cast<void*>(bind.buffer->data<float>());
} }
...@@ -356,7 +359,7 @@ void TrtEngine::FreshDeviceId() { ...@@ -356,7 +359,7 @@ void TrtEngine::FreshDeviceId() {
int count; int count;
cudaGetDeviceCount(&count); cudaGetDeviceCount(&count);
CHECK_LT(device_id_, count); CHECK_LT(device_id_, count);
phi::backends::gpu::SetDeviceId(device_id_); ::phi::backends::gpu::SetDeviceId(device_id_);
} }
void TrtEngine::GetEngineInfo() { void TrtEngine::GetEngineInfo() {
......
...@@ -76,19 +76,19 @@ class TrtEngine { ...@@ -76,19 +76,19 @@ class TrtEngine {
const BuildOptions& build_options); const BuildOptions& build_options);
// TODO(wilber): Modify signature after infrt-trt ready. // TODO(wilber): Modify signature after infrt-trt ready.
void Run(const phi::GPUContext& ctx); void Run(const ::phi::GPUContext& ctx);
// TODO(wilber): How to support multiple execution contexts? // TODO(wilber): How to support multiple execution contexts?
bool SetUpInference( bool SetUpInference(
const InferenceOptions& inference, const InferenceOptions& inference,
const std::unordered_map<std::string, phi::DenseTensor*>& inputs); const std::unordered_map<std::string, ::phi::DenseTensor*>& inputs);
void GetEngineInfo(); void GetEngineInfo();
void PrepareOutputHandle(const std::string& out_name); void PrepareOutputHandle(const std::string& out_name);
// TODO(wilber): The output tensor names are: output_0, output_1, ... // TODO(wilber): The output tensor names are: output_0, output_1, ...
phi::DenseTensor* GetOutput(const std::string&); ::phi::DenseTensor* GetOutput(const std::string&);
size_t GetOutputNum() const; size_t GetOutputNum() const;
...@@ -104,9 +104,9 @@ class TrtEngine { ...@@ -104,9 +104,9 @@ class TrtEngine {
bool ModelToBuildEnv(TrtUniquePtr<nvinfer1::INetworkDefinition> network, bool ModelToBuildEnv(TrtUniquePtr<nvinfer1::INetworkDefinition> network,
const BuildOptions& build); const BuildOptions& build);
void StaticRun(const phi::GPUContext& ctx); void StaticRun(const ::phi::GPUContext& ctx);
void DynamicRun(const phi::GPUContext& ctx); void DynamicRun(const ::phi::GPUContext& ctx);
private: private:
std::unique_ptr<TrtLogger> logger_{nullptr}; std::unique_ptr<TrtLogger> logger_{nullptr};
...@@ -118,7 +118,7 @@ class TrtEngine { ...@@ -118,7 +118,7 @@ class TrtEngine {
std::vector<std::unique_ptr<Bindings>> bindings_; std::vector<std::unique_ptr<Bindings>> bindings_;
int device_id_{0}; int device_id_{0};
bool is_dynamic_shape_{false}; bool is_dynamic_shape_{false};
std::unordered_map<std::string, phi::DenseTensor> outputs_; std::unordered_map<std::string, ::phi::DenseTensor> outputs_;
}; };
} // namespace tensorrt } // namespace tensorrt
......
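Putting the header's API together, a client would drive the engine roughly as follows. This is a hedged sketch only: the build call and the BuildOptions/InferenceOptions setup are elided, and the input tensor is assumed to be prepared by the caller.
// Hedged usage sketch of TrtEngine based on the declarations above.
::infrt::backends::tensorrt::TrtEngine engine(/*device_id=*/0);
// ... engine build step with network definition and BuildOptions elided ...
engine.PrepareOutputHandle("output_0");
std::unordered_map<std::string, ::phi::DenseTensor*> inputs;
inputs["input_0"] = &input_tensor;  // caller-prepared tensor (assumed)
engine.SetUpInference(inference_options, inputs);
engine.Run(gpu_ctx);  // ::phi::GPUContext supplied by the caller
::phi::DenseTensor* out = engine.GetOutput("output_0");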
...@@ -92,7 +92,7 @@ class TrtLogger : public nvinfer1::ILogger { ...@@ -92,7 +92,7 @@ class TrtLogger : public nvinfer1::ILogger {
struct Binding { struct Binding {
bool is_input{false}; bool is_input{false};
nvinfer1::DataType data_type{nvinfer1::DataType::kFLOAT}; nvinfer1::DataType data_type{nvinfer1::DataType::kFLOAT};
phi::DenseTensor* buffer{nullptr}; ::phi::DenseTensor* buffer{nullptr};
std::string name; std::string name;
}; };
...@@ -103,7 +103,7 @@ class Bindings { ...@@ -103,7 +103,7 @@ class Bindings {
void AddBinding(int32_t b, void AddBinding(int32_t b,
const std::string& name, const std::string& name,
bool is_input, bool is_input,
phi::DenseTensor* buffer, ::phi::DenseTensor* buffer,
nvinfer1::DataType data_type) { nvinfer1::DataType data_type) {
while (bindings_.size() <= static_cast<size_t>(b)) { while (bindings_.size() <= static_cast<size_t>(b)) {
bindings_.emplace_back(); bindings_.emplace_back();
......
...@@ -97,4 +97,17 @@ def FakeKernelOp : PDT_Op<"fake_phi_kernel"> { ...@@ -97,4 +97,17 @@ def FakeKernelOp : PDT_Op<"fake_phi_kernel"> {
let results = (outs DenseTensor:$output); let results = (outs DenseTensor:$output);
} }
// TODO(wilber): Add a infrt_gpu dialect.
def PDT_GpuMemCopyOp : PDT_Op<"memcpy.gpu", [NoSideEffect]> {
let summary = "phi_dt.gpu.memcpy";
let description = [{gpu memcpy d2h or h2d}];
// TODO(wilber): add context argument to support stream.
let arguments = (ins
DenseTensor:$input,
Context:$context,
BoolAttr:$d2h
);
let results = (outs DenseTensor:$output);
}
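For reference, the pass added later in this change creates this op from C++ in the following shape (a sketch distilled from trt_type_convert_pass.cc below; loc, dst_type, src_value, and gpu_ctx_value stand for whatever the surrounding pass has in scope):
// Sketch of building phi_dt.memcpy.gpu from a pass; placeholder operands.
auto memcpy_op = builder.create<::infrt::phi::GpuMemCopyOp>(
    loc,
    dst_type,        // result DenseTensor type
    src_value,       // $input
    gpu_ctx_value,   // $context produced by phi_dt.create_context.gpu
    mlir::BoolAttr::get(mlir_ctx, /*d2h=*/false));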
#endif #endif
...@@ -97,12 +97,13 @@ void PhiOpConvertPass::convertStage() { ...@@ -97,12 +97,13 @@ void PhiOpConvertPass::convertStage() {
} }
auto loc = getFunction().getLoc(); auto loc = getFunction().getLoc();
builder.setInsertionPoint(op); builder.setInsertionPoint(op);
if (phi::KernelFactory::Instance().HasCompatiblePhiKernel(op_name)) {
std::string kernel_name = phi::TransToPhiKernelName(op_name); if (!::phi::OpUtilsMap::Instance().HasArgumentMappingFn(op_name)) {
op_name = phi::TransToPhiKernelName(op_name);
auto kernel_op = builder.create<infrt::KernelOp>(loc, auto kernel_op = builder.create<infrt::KernelOp>(loc,
op->getResultTypes(), op->getResultTypes(),
op->getOperands(), op->getOperands(),
kernel_name, op_name,
op->getAttrDictionary()); op->getAttrDictionary());
op->replaceAllUsesWith(kernel_op.getResults()); op->replaceAllUsesWith(kernel_op.getResults());
} else { } else {
......
...@@ -32,17 +32,24 @@ bool ProtoArgumentMappingContext::HasOutput(const std::string& name) const { ...@@ -32,17 +32,24 @@ bool ProtoArgumentMappingContext::HasOutput(const std::string& name) const {
} }
bool ProtoArgumentMappingContext::HasAttr(const std::string& name) const { bool ProtoArgumentMappingContext::HasAttr(const std::string& name) const {
if (name == "is_test") return true;
return op_->hasAttr(name); return op_->hasAttr(name);
} }
paddle::any ProtoArgumentMappingContext::Attr(const std::string& name) const { paddle::any ProtoArgumentMappingContext::Attr(const std::string& name) const {
mlir::Attribute attrs = op_->getAttr(name); if (name == "is_test") {
if (mlir::StringAttr str_attr = attrs.dyn_cast_or_null<mlir::StringAttr>()) { return paddle::any(true);
}
mlir::Attribute attr = op_->getAttr(name);
if (!attr) {
return paddle::any();
}
if (mlir::StringAttr str_attr = attr.dyn_cast<mlir::StringAttr>()) {
return paddle::any(str_attr.str()); return paddle::any(str_attr.str());
} else { }
// TODO: implementation in the ext PR. // TODO: implementation in the ext PR.
return paddle::any(0); return paddle::any(0);
}
} }
size_t ProtoArgumentMappingContext::InputSize(const std::string& name) const { size_t ProtoArgumentMappingContext::InputSize(const std::string& name) const {
......
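The TODO above defers full attribute unpacking to a follow-up PR. A hedged sketch of how further kinds might be handled, mirroring the StringAttr branch — the chosen set of attribute kinds is an assumption, not part of this change:
// Hypothetical extension of Attr(); not part of this diff.
if (auto bool_attr = attr.dyn_cast<mlir::BoolAttr>()) {
  return paddle::any(bool_attr.getValue());
}
if (auto int_attr = attr.dyn_cast<mlir::IntegerAttr>()) {
  return paddle::any(static_cast<int>(int_attr.getInt()));
}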
...@@ -6,6 +6,7 @@ gather_srcs(infrt_src SRCS ...@@ -6,6 +6,7 @@ gather_srcs(infrt_src SRCS
trt_op_teller_pass.cc trt_op_teller_pass.cc
trt_graph_fuse_pass.cc trt_graph_fuse_pass.cc
trt_graph_split_pass.cc trt_graph_split_pass.cc
trt_type_convert_pass.cc
) )
mlir_tablegen_on(trt_ops) mlir_tablegen_on(trt_ops)
mlir_add_rewriter(pd_lower_to_trt) mlir_add_rewriter(pd_lower_to_trt)
......
...@@ -21,6 +21,26 @@ ...@@ -21,6 +21,26 @@
#include "paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h" #include "paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h"
#include "paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h" #include "paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h"
#include "paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h" #include "paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h"
#include "paddle/infrt/dialect/tensorrt/trt_type_convert_pass.h"
#include "paddle/infrt/host_context/core_runtime.h"
#include "paddle/infrt/host_context/kernel_registry.h"
#include "paddle/infrt/host_context/mlir_to_runtime_translate.h"
#include "paddle/infrt/kernel/basic_kernels.h"
#include "paddle/infrt/kernel/control_flow_kernels.h"
#include "paddle/infrt/kernel/tensor_kernels.h"
#include "paddle/infrt/kernel/tensor_shape_kernels.h"
#include "paddle/infrt/kernel/test_kernels.h"
#include "paddle/infrt/kernel/tensorrt/registry.h"
#ifdef INFRT_WITH_PHI
#include "paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.h"
#include "paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h"
#include "paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launchers.h"
#include "paddle/infrt/kernel/phi/registry.h"
#endif
int main(int argc, char** argv) { int main(int argc, char** argv) {
static llvm::cl::opt<std::string> input_file( static llvm::cl::opt<std::string> input_file(
...@@ -33,6 +53,22 @@ int main(int argc, char** argv) { ...@@ -33,6 +53,22 @@ int main(int argc, char** argv) {
mlir::MLIRContext* context = infrt::Global::getMLIRContext(); mlir::MLIRContext* context = infrt::Global::getMLIRContext();
auto module = infrt::dialect::LoadMlirFile(input_file.c_str(), context); auto module = infrt::dialect::LoadMlirFile(input_file.c_str(), context);
infrt::host_context::KernelRegistry registry;
::infrt::kernel::RegisterBasicKernels(&registry);
::infrt::kernel::RegisterTestKernels(&registry);
::infrt::kernel::RegisterTensorShapeKernels(&registry);
::infrt::kernel::RegisterTensorKernels(&registry);
::infrt::kernel::RegisterControlFlowKernels(&registry);
#ifdef INFRT_WITH_PHI
::infrt::kernel::RegisterPhiKernels(&registry);
::infrt::kernel::RegisterInferShapeLaunchers(&registry);
#endif
#if defined(INFRT_WITH_GPU) && defined(INFRT_WITH_TRT)
::infrt::kernel::RegisterTrtKernels(&registry);
#endif
context->loadAllAvailableDialects();
module->dump(); module->dump();
mlir::PassManager pm(context); mlir::PassManager pm(context);
...@@ -41,10 +77,12 @@ int main(int argc, char** argv) { ...@@ -41,10 +77,12 @@ int main(int argc, char** argv) {
trt_pass_manager.addPass(std::make_unique<infrt::trt::TRTGraphFusePass>()); trt_pass_manager.addPass(std::make_unique<infrt::trt::TRTGraphFusePass>());
trt_pass_manager.addPass(std::make_unique<infrt::trt::TRTGraphSplitPass>(1)); trt_pass_manager.addPass(std::make_unique<infrt::trt::TRTGraphSplitPass>(1));
trt_pass_manager.addPass(std::make_unique<infrt::trt::TRTOpConverterPass>()); trt_pass_manager.addPass(std::make_unique<infrt::trt::TRTOpConverterPass>());
trt_pass_manager.addPass(infrt::trt::createTrtTypeConvertPass());
if (mlir::failed(pm.run(*module))) { if (mlir::failed(pm.run(*module))) {
std::cout << "\npass failed!\n" << std::endl; std::cout << "\npass failed!\n" << std::endl;
return 4; return 4;
} }
module->dump(); module->dump();
::infrt::host_context::TestMlir(module.get(), &registry);
return 0; return 0;
} }
...@@ -12,10 +12,17 @@ ...@@ -12,10 +12,17 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h" #include "paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h"
#include <glog/logging.h>
#include <mlir/IR/Builders.h> #include <mlir/IR/Builders.h>
#include <mlir/Transforms/DialectConversion.h> #include <mlir/Transforms/DialectConversion.h>
#include "paddle/infrt/dialect/dense_tensor.h"
#include "paddle/infrt/dialect/pd/ir/pd_ops.h" #include "paddle/infrt/dialect/pd/ir/pd_ops.h"
#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h"
#include "paddle/infrt/dialect/phi/ir/phi_base.h"
#include "paddle/infrt/dialect/tensorrt/trt_dialect_types.h" #include "paddle/infrt/dialect/tensorrt/trt_dialect_types.h"
#include "paddle/infrt/dialect/tensorrt/trt_ops.h"
namespace infrt { namespace infrt {
namespace trt { namespace trt {
...@@ -41,34 +48,34 @@ struct PD2TRT_GraphLower : public ::mlir::RewritePattern { ...@@ -41,34 +48,34 @@ struct PD2TRT_GraphLower : public ::mlir::RewritePattern {
::llvm::SmallVector<mlir::Type, 4>(1, EngineType::get()), ::llvm::SmallVector<mlir::Type, 4>(1, EngineType::get()),
trt_inputs, trt_inputs,
true /*run_once*/); true /*run_once*/);
::mlir::Block *block = new ::mlir::Block; auto &block = create_engine_op.body().emplaceBlock();
block->getOperations().splice(block->begin(), block.getOperations().splice(block.begin(),
casted_op.getBody()->getOperations(), casted_op.getBody()->getOperations(),
casted_op.getBody()->begin(), casted_op.getBody()->begin(),
casted_op.getBody()->end()); casted_op.getBody()->end());
create_engine_op.body().push_back(block);
// trt.execute // trt.compute
// outputs ::llvm::SmallVector<::mlir::Value, 4> replace_values2;
::llvm::SmallVector<::mlir::Type, 4> execute_outputs_types; auto ctx_op = rewriter.create<::infrt::phi::CreateGPUContextOp>(
for (auto v : casted_op.getODSResults(0)) { ods_loc,
execute_outputs_types.push_back(v.getType()); infrt::phi::ContextType::get(rewriter.getContext(),
} infrt::TargetType::GPU));
// inputs auto compute_op = rewriter.create<EngineComputeOp>(
::mlir::SmallVector<::mlir::Value, 4> execute_inputs( ods_loc,
create_engine_op.getODSResults(0)); ::infrt::DenseTensorListType::get(rewriter.getContext()),
for (auto v : inputs) { create_engine_op.engine(),
execute_inputs.push_back(v); ctx_op.output());
} auto tensor_list_val = compute_op.outputs();
auto execute_op = rewriter.create<ExecuteOp>( for (size_t i = 0; i < casted_op.getNumResults(); ++i) {
ods_loc, execute_outputs_types, execute_inputs); auto res = casted_op->getResult(i);
auto int_attr = mlir::IntegerAttr::get(
::llvm::SmallVector<::mlir::Value, 4> replace_values; mlir::IntegerType::get(rewriter.getContext(), 32), i);
for (auto v : auto get_tensor_op = rewriter.create<::infrt::dt::TensorListGetTensorOp>(
::llvm::SmallVector<::mlir::Value, 4>{execute_op.getODSResults(0)}) { ods_loc, res.getType(), tensor_list_val, int_attr);
replace_values.push_back(v); replace_values2.push_back(get_tensor_op.output());
} }
rewriter.replaceOp(op, replace_values); ctx_op->moveBefore(ctx_op->getBlock(), ctx_op->getBlock()->begin());
rewriter.replaceOp(op, replace_values2);
return ::mlir::success(); return ::mlir::success();
} }
}; };
...@@ -82,6 +89,9 @@ void TRTOpConverterPass::runOnOperation() { ...@@ -82,6 +89,9 @@ void TRTOpConverterPass::runOnOperation() {
// this lowering. In our case, we are lowering to TensorRTDialect from // this lowering. In our case, we are lowering to TensorRTDialect from
// PaddleDialect // PaddleDialect
target.addLegalDialect<TensorRTDialect>(); target.addLegalDialect<TensorRTDialect>();
target.addLegalDialect<::infrt::phi::PHIDialect>();
target.addLegalDialect<::infrt::dt::DTDialect>();
target.addLegalDialect<phi::PHIDenseTensorDialect>();
// Now that the conversion target has been defined, we just need to provide // Now that the conversion target has been defined, we just need to provide
// the set of patterns that will lower the TensorRT operations. // the set of patterns that will lower the TensorRT operations.
......
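The comment above is where the conversion driver takes over. The usual tail of such a runOnOperation() looks roughly like this — a sketch that assumes the RewritePatternSet/applyPartialConversion interface of the MLIR release in use, not verbatim code from this diff:
// Hedged sketch of the pattern-application step.
mlir::RewritePatternSet patterns(&getContext());
patterns.add<PD2TRT_GraphLower>(&getContext());
if (mlir::failed(mlir::applyPartialConversion(
        getOperation(), target, std::move(patterns)))) {
  signalPassFailure();
}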
...@@ -14,7 +14,9 @@ ...@@ -14,7 +14,9 @@
#include "paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h" #include "paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h"
#include <llvm/Support/Casting.h>
#include <mlir/IR/Builders.h> #include <mlir/IR/Builders.h>
#include "paddle/infrt/dialect/dense_tensor.h"
#include "paddle/infrt/dialect/infrt/ir/basic_kernels.h" #include "paddle/infrt/dialect/infrt/ir/basic_kernels.h"
#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" #include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h"
#include "paddle/infrt/dialect/pd/ir/pd_ops.h" #include "paddle/infrt/dialect/pd/ir/pd_ops.h"
...@@ -35,10 +37,12 @@ void TRTOpTellerPass::runOnFunction() { ...@@ -35,10 +37,12 @@ void TRTOpTellerPass::runOnFunction() {
auto *op = worklist.back(); auto *op = worklist.back();
worklist.pop_back(); worklist.pop_back();
if (op == nullptr) continue; if (op == nullptr) continue;
if (op->getName().getStringRef().substr(0, 3) != "pd.") continue;
if (::llvm::dyn_cast_or_null<infrt::pd::FeedOp>(op)) continue; if (::llvm::dyn_cast_or_null<infrt::pd::FeedOp>(op)) continue;
if (::llvm::dyn_cast_or_null<infrt::pd::FetchOp>(op)) continue; if (::llvm::dyn_cast_or_null<infrt::pd::FetchOp>(op)) continue;
if (::llvm::dyn_cast_or_null<infrt::pd::GraphOp>(op)) continue; if (::llvm::dyn_cast_or_null<infrt::pd::GraphOp>(op)) continue;
if (::llvm::dyn_cast_or_null<::infrt::ReturnOp>(op)) continue; if (::llvm::dyn_cast_or_null<::infrt::ReturnOp>(op)) continue;
builder.setInsertionPoint(op); builder.setInsertionPoint(op);
auto loc = getFunction().getLoc(); auto loc = getFunction().getLoc();
auto graph_op = builder.create<infrt::pd::GraphOp>( auto graph_op = builder.create<infrt::pd::GraphOp>(
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/infrt/dialect/tensorrt/trt_type_convert_pass.h"
#include <glog/logging.h>
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Casting.h"
#include "mlir/IR/Block.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/Dialect.h"
#include "mlir/IR/Operation.h"
#include "mlir/IR/OperationSupport.h"
#include "mlir/IR/Value.h"
#include "mlir/Pass/Pass.h"
#include "paddle/infrt/dialect/infrt/common/types.h"
#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h"
#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h"
#include "paddle/infrt/dialect/tensorrt/trt_ops.h"
namespace {
class TrtTypeConvertPass
: public mlir::PassWrapper<TrtTypeConvertPass, mlir::FunctionPass> {
public:
::llvm::StringRef getName() const override { return "TrtTypeConvertPass"; }
void runOnFunction() override;
};
void TrtTypeConvertPass::runOnFunction() {
mlir::Block& body = getFunction().front();
auto* mlir_ctx = getFunction()->getContext();
mlir::OpBuilder builder(&body, body.begin());
std::vector<mlir::Operation*> worklist;
mlir::Operation* ctx_op{nullptr};
worklist.reserve(body.getOperations().size());
for (auto& op : body) {
worklist.push_back(&op);
if (op.getName().getStringRef() == "phi_dt.create_context.gpu") {
ctx_op = &op;
}
}
::infrt::LayoutType layout = ::infrt::LayoutType::NCHW;
::infrt::TargetType target = ::infrt::TargetType::GPU;
for (auto& op : worklist) {
if (auto tensor_map_get_op =
llvm::dyn_cast<::infrt::phi::TensorMapGetTensorOp>(op)) {
auto res = tensor_map_get_op.output();
if (auto t = res.getType().dyn_cast<::infrt::DenseTensorType>()) {
auto replace_type = ::infrt::DenseTensorType::get(
mlir_ctx, t.getTarget(), t.getPrecision(), layout);
res.setType(replace_type);
}
}
if (auto create_engine = llvm::dyn_cast<::infrt::trt::CreateEngineOp>(op)) {
// Insert `infrt.gpu.memcpy` op.
for (auto arg : create_engine.getOperands()) {
if (mlir::Operation* producer = arg.getDefiningOp()) {
if (arg.getType().isa<::infrt::DenseTensorType>()) {
builder.setInsertionPointAfter(producer);
auto t = arg.getType().dyn_cast<::infrt::DenseTensorType>();
if (producer->getName().getStringRef() !=
"phi_dt.tensor_map_get_tensor" &&
t.getTarget() != ::infrt::TargetType::GPU) {
auto replace_type = ::infrt::DenseTensorType::get(
mlir_ctx, target, t.getPrecision(), layout);
CHECK_NOTNULL(ctx_op);
auto mem_cpy_op = builder.create<::infrt::phi::GpuMemCopyOp>(
arg.getLoc(),
replace_type,
arg,
llvm::dyn_cast<::infrt::phi::CreateGPUContextOp>(ctx_op)
.output(),
mlir::BoolAttr::get(mlir_ctx, /*d2h*/ false));
arg.replaceAllUsesExcept(mem_cpy_op.output(), mem_cpy_op);
}
}
} else {
auto blockArg = arg.cast<mlir::BlockArgument>();
if (arg.getType().isa<::infrt::DenseTensorType>()) {
auto t = arg.getType().dyn_cast<::infrt::DenseTensorType>();
builder.setInsertionPointAfter(ctx_op);
auto replace_type = ::infrt::DenseTensorType::get(
mlir_ctx, ::infrt::TargetType::GPU, t.getPrecision(), layout);
CHECK_NOTNULL(ctx_op);
auto mem_cpy_op = builder.create<::infrt::phi::GpuMemCopyOp>(
blockArg.getLoc(),
replace_type,
blockArg,
llvm::dyn_cast<::infrt::phi::CreateGPUContextOp>(ctx_op)
.output(),
mlir::BoolAttr::get(mlir_ctx, /*d2h*/ false));
arg.replaceAllUsesExcept(mem_cpy_op.output(), mem_cpy_op);
}
}
}
// Change ops(in block) types.
auto& block = create_engine.getRegion().getBlocks().front();
for (auto& op : block.without_terminator()) {
for (size_t i = 0; i < op.getNumResults(); ++i) {
if (auto t = op.getResult(i)
.getType()
.dyn_cast<::infrt::DenseTensorType>()) {
auto replace_type = ::infrt::DenseTensorType::get(
mlir_ctx, ::infrt::TargetType::GPU, t.getPrecision(), layout);
op.getResult(i).setType(replace_type);
}
}
}
} else if (auto list_get_tensor_op =
llvm::dyn_cast<::infrt::dt::TensorListGetTensorOp>(op)) {
auto result = list_get_tensor_op.output();
if (auto t = result.getType().dyn_cast<::infrt::DenseTensorType>()) {
result.setType(::infrt::DenseTensorType::get(
mlir_ctx, ::infrt::TargetType::GPU, t.getPrecision(), layout));
}
} else if (auto return_op = llvm::dyn_cast<::infrt::ReturnOp>(op)) {
for (auto arg : return_op->getOperands()) {
if (auto t = arg.getType().dyn_cast<::infrt::DenseTensorType>()) {
if (t.getLayout() != ::infrt::LayoutType::ANY ||
t.getTarget() != ::infrt::TargetType::CPU ||
t.getPrecision() != ::infrt::PrecisionType::FLOAT32) {
builder.setInsertionPoint(return_op);
CHECK_NOTNULL(ctx_op);
auto mem_cpy_op = builder.create<::infrt::phi::GpuMemCopyOp>(
return_op.getLoc(),
::infrt::DenseTensorType::get(mlir_ctx,
::infrt::TargetType::CPU,
t.getPrecision(),
::infrt::LayoutType::ANY),
arg,
llvm::dyn_cast<::infrt::phi::CreateGPUContextOp>(ctx_op)
.output(),
mlir::BoolAttr::get(mlir_ctx, /*d2h*/ true));
arg.replaceAllUsesExcept(mem_cpy_op.output(), mem_cpy_op);
}
}
}
}
}
}
} // namespace
namespace infrt {
namespace trt {
std::unique_ptr<mlir::Pass> createTrtTypeConvertPass() {
return std::make_unique<TrtTypeConvertPass>();
}
} // namespace trt
} // namespace infrt
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <mlir/Pass/Pass.h>
namespace infrt {
namespace trt {
std::unique_ptr<mlir::Pass> createTrtTypeConvertPass();
} // namespace trt
} // namespace infrt
...@@ -147,6 +147,7 @@ class Value : public common::Object { ...@@ -147,6 +147,7 @@ class Value : public common::Object {
#endif #endif
explicit Value(::phi::DenseTensor&& x) : data(std::move(x)) {} explicit Value(::phi::DenseTensor&& x) : data(std::move(x)) {}
explicit Value(::phi::MetaTensor&& x) : data(std::move(x)) {} explicit Value(::phi::MetaTensor&& x) : data(std::move(x)) {}
explicit Value(::phi::MetaConfig&& x) : data(std::move(x)) {}
#ifdef INFRT_WITH_TRT #ifdef INFRT_WITH_TRT
explicit Value(::infrt::backends::tensorrt::TrtEngine&& x) explicit Value(::infrt::backends::tensorrt::TrtEngine&& x)
: data(std::move(x)) {} : data(std::move(x)) {}
......
...@@ -30,6 +30,7 @@ namespace phi { ...@@ -30,6 +30,7 @@ namespace phi {
::phi::GPUContext context; ::phi::GPUContext context;
context.PartialInitWithoutAllocator(); context.PartialInitWithoutAllocator();
context.SetAllocator(new ::infrt::backends::GpuPhiAllocator{}); context.SetAllocator(new ::infrt::backends::GpuPhiAllocator{});
context.SetHostAllocator(new backends::CpuPhiAllocator{});
context.PartialInitWithAllocator(); context.PartialInitWithAllocator();
return context; return context;
} }
......
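The new SetHostAllocator line matters for the d2h path introduced by phi_dt.memcpy.gpu: copying device results back requires host-side buffers allocated through the same context. A hedged illustration, assuming the DeviceContext::HostAlloc<T> interface:
// Hypothetical host-side allocation through the context configured above.
::phi::DenseTensor host_tensor;
host_tensor.Resize(::phi::make_ddim({n}));  // n: element count (assumed)
context.HostAlloc<float>(&host_tensor);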
...@@ -7,3 +7,4 @@ add_test(NAME test_infrt_by_lit COMMAND sh -c "lit -v ${CMAKE_SOURCE_DIR}/paddle ...@@ -7,3 +7,4 @@ add_test(NAME test_infrt_by_lit COMMAND sh -c "lit -v ${CMAKE_SOURCE_DIR}/paddle
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/dialect/tensor/tensor_map.mlir.in ${CMAKE_CURRENT_SOURCE_DIR}/dialect/tensor/tensor_map.mlir) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/dialect/tensor/tensor_map.mlir.in ${CMAKE_CURRENT_SOURCE_DIR}/dialect/tensor/tensor_map.mlir)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/dialect/phi/linear_cpu.mlir.in ${CMAKE_CURRENT_SOURCE_DIR}/dialect/phi/linear_cpu.mlir) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/dialect/phi/linear_cpu.mlir.in ${CMAKE_CURRENT_SOURCE_DIR}/dialect/phi/linear_cpu.mlir)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/dialect/tensorrt/disabled_linear.mlir.in ${CMAKE_CURRENT_SOURCE_DIR}/dialect/tensorrt/disabled_linear.mlir)
(The remaining file diffs in this commit are collapsed.)