Commit 7ab3f36e authored by phlrain

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into add_some_yaml_config

......@@ -100,7 +100,6 @@ function(kernel_library TARGET)
set(xpu_srcs)
set(gpudnn_srcs)
set(kps_srcs)
set(selected_rows_srcs)
# parse and save the dependent kernel targets
set(all_srcs)
set(kernel_deps)
......@@ -112,6 +111,12 @@ function(kernel_library TARGET)
cmake_parse_arguments(kernel_library "${options}" "${oneValueArgs}"
"${multiValueArgs}" ${ARGN})
# suffix used for cc_library targets built from the selected_rows dir
set(target_suffix "")
if ("${kernel_library_SUB_DIR}" STREQUAL "selected_rows_kernel")
set(target_suffix "_sr")
endif()
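# e.g. with suffix "_sr", ${TARGET} is built as ${TARGET}_sr and ${TARGET}_cpu as ${TARGET}_cpu_sr below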
list(LENGTH kernel_library_SRCS kernel_library_SRCS_len)
# one kernel matches only one impl file in each backend
if (${kernel_library_SRCS_len} EQUAL 0)
......@@ -121,9 +126,6 @@ function(kernel_library TARGET)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/cpu/${TARGET}.cc AND NOT WITH_XPU_KP)
list(APPEND cpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/${TARGET}.cc)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/selected_rows/${TARGET}.cc)
list(APPEND selected_rows_srcs ${CMAKE_CURRENT_SOURCE_DIR}/selected_rows/${TARGET}.cc)
endif()
if (WITH_GPU OR WITH_ROCM)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu)
list(APPEND gpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu)
......@@ -169,26 +171,46 @@ function(kernel_library TARGET)
list(APPEND all_srcs ${xpu_srcs})
list(APPEND all_srcs ${gpudnn_srcs})
list(APPEND all_srcs ${kps_srcs})
set(all_include_kernels)
set(all_kernel_name)
foreach(src ${all_srcs})
file(READ ${src} target_content)
# "kernels/xxx"(DenseTensor Kernel) can only include each other, but can't include "SUB_DIR/xxx" (such as selected_rows Kernel)
string(REGEX MATCHALL "#include \"paddle\/phi\/kernels\/[a-z0-9_]+_kernel.h\"" include_kernels ${target_content})
if ("${kernel_library_SUB_DIR}" STREQUAL "")
string(REGEX MATCHALL "#include \"paddle\/phi\/kernels\/[a-z0-9_]+_kernel.h\"" include_kernels ${target_content})
else()
list(APPEND all_include_kernels ${include_kernels})
# "SUB_DIR/xxx" can include "kernels/xx" and "SUB_DIR/xxx"
if (NOT "${kernel_library_SUB_DIR}" STREQUAL "")
string(REGEX MATCHALL "#include \"paddle\/phi\/kernels\/${kernel_library_SUB_DIR}\/[a-z0-9_]+_kernel.h\"" include_kernels ${target_content})
list(APPEND all_include_kernels ${include_kernels})
endif()
foreach(include_kernel ${include_kernels})
foreach(include_kernel ${all_include_kernels})
if ("${kernel_library_SUB_DIR}" STREQUAL "")
string(REGEX REPLACE "#include \"paddle\/phi\/kernels\/" "" kernel_name ${include_kernel})
string(REGEX REPLACE ".h\"" "" kernel_name ${kernel_name})
list(APPEND all_kernel_name ${kernel_name})
else()
# NOTE(dev): we should match kernel_library_SUB_DIR first.
if (${include_kernel} MATCHES "#include \"paddle\/phi\/kernels\/${kernel_library_SUB_DIR}\/")
string(REGEX REPLACE "#include \"paddle\/phi\/kernels\/${kernel_library_SUB_DIR}\/" "" kernel_name ${include_kernel})
endif()
# for the selected_rows directory, append ${target_suffix}.
string(REGEX REPLACE ".h\"" "${target_suffix}" kernel_name ${kernel_name})
list(APPEND all_kernel_name ${kernel_name})
else()
string(REGEX REPLACE "#include \"paddle\/phi\/kernels\/" "" kernel_name ${include_kernel})
string(REGEX REPLACE ".h\"" "" kernel_name ${kernel_name})
list(APPEND kernel_deps ${kernel_name})
list(APPEND all_kernel_name ${kernel_name})
endif()
message(STATUS "${TARGET} DEPS ${all_kernel_name}")
endif()
list(APPEND kernel_deps ${all_kernel_name})
endforeach()
endforeach()
list(REMOVE_DUPLICATES kernel_deps)
list(REMOVE_ITEM kernel_deps ${TARGET})
list(REMOVE_ITEM kernel_deps ${TARGET}${target_suffix})
list(LENGTH common_srcs common_srcs_len)
list(LENGTH cpu_srcs cpu_srcs_len)
......@@ -196,92 +218,73 @@ function(kernel_library TARGET)
list(LENGTH xpu_srcs xpu_srcs_len)
list(LENGTH gpudnn_srcs gpudnn_srcs_len)
list(LENGTH kps_srcs kps_srcs_len)
list(LENGTH selected_rows_srcs selected_rows_srcs_len)
# kernel source file level
# level 1: base device kernel
# - cpu_srcs / gpu_srcs / xpu_srcs / gpudnn_srcs / kps_srcs
# level 2: device-independent kernel
# - common_srcs
# level 3: Kernel implemented by reusing device-independent kernel
# - selected_rows_srcs
set(base_device_kernels)
set(device_independent_kernel)
set(high_level_kernels)
# 1. Base device kernel compile
if (${cpu_srcs_len} GREATER 0)
cc_library(${TARGET}_cpu SRCS ${cpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
list(APPEND base_device_kernels ${TARGET}_cpu)
cc_library(${TARGET}_cpu${target_suffix} SRCS ${cpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
list(APPEND base_device_kernels ${TARGET}_cpu${target_suffix})
endif()
if (${gpu_srcs_len} GREATER 0)
if (WITH_GPU)
nv_library(${TARGET}_gpu SRCS ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
nv_library(${TARGET}_gpu${target_suffix} SRCS ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
elseif (WITH_ROCM)
hip_library(${TARGET}_gpu SRCS ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
hip_library(${TARGET}_gpu${target_suffix} SRCS ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
endif()
list(APPEND base_device_kernels ${TARGET}_gpu)
list(APPEND base_device_kernels ${TARGET}_gpu${target_suffix})
endif()
if (${xpu_srcs_len} GREATER 0)
cc_library(${TARGET}_xpu SRCS ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
list(APPEND base_device_kernels ${TARGET}_xpu)
cc_library(${TARGET}_xpu${target_suffix} SRCS ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
list(APPEND base_device_kernels ${TARGET}_xpu${target_suffix})
endif()
if (${gpudnn_srcs_len} GREATER 0)
if (WITH_GPU)
nv_library(${TARGET}_gpudnn SRCS ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
nv_library(${TARGET}_gpudnn${target_suffix} SRCS ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
elseif (WITH_ROCM)
hip_library(${TARGET}_gpudnn SRCS ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
hip_library(${TARGET}_gpudnn${target_suffix} SRCS ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
endif()
list(APPEND base_device_kernels ${TARGET}_gpudnn)
list(APPEND base_device_kernels ${TARGET}_gpudnn${target_suffix})
endif()
if (${kps_srcs_len} GREATER 0)
# kps_srcs_len can be greater than 0 only when WITH_XPU_KP is enabled
xpu_library(${TARGET}_kps SRCS ${kps_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
list(APPEND base_device_kernels ${TARGET}_kps)
xpu_library(${TARGET}_kps${target_suffix} SRCS ${kps_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
list(APPEND base_device_kernels ${TARGET}_kps${target_suffix})
endif()
# 2. Device-independent kernel compile
if (${common_srcs_len} GREATER 0)
if (WITH_GPU)
nv_library(${TARGET}_common SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels})
nv_library(${TARGET}_common${target_suffix} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels})
elseif (WITH_ROCM)
hip_library(${TARGET}_common SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels})
hip_library(${TARGET}_common${target_suffix} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels})
elseif (WITH_XPU_KP)
xpu_library(${TARGET}_common SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels})
xpu_library(${TARGET}_common${target_suffix} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels})
else()
cc_library(${TARGET}_common SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels})
cc_library(${TARGET}_common${target_suffix} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels})
endif()
list(APPEND device_independent_kernel ${TARGET}_common)
list(APPEND device_independent_kernel ${TARGET}_common${target_suffix})
endif()
# 3. Reusing kernel compile
if (${selected_rows_srcs_len} GREATER 0)
if (WITH_GPU)
nv_library(${TARGET}_sr SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel})
elseif (WITH_ROCM)
hip_library(${TARGET}_sr SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel})
elseif (WITH_XPU_KP)
xpu_library(${TARGET}_sr SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel})
else()
cc_library(${TARGET}_sr SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel})
endif()
list(APPEND high_level_kernels ${TARGET}_sr)
endif()
# 4. Unify target compile
# 3. Unify target compile
list(LENGTH base_device_kernels base_device_kernels_len)
list(LENGTH device_independent_kernel device_independent_kernel_len)
list(LENGTH high_level_kernels high_level_kernels_len)
if (${base_device_kernels_len} GREATER 0 OR ${device_independent_kernel_len} GREATER 0 OR
${high_level_kernels_len} GREATER 0)
if (${base_device_kernels_len} GREATER 0 OR ${device_independent_kernel_len} GREATER 0)
if (WITH_GPU)
nv_library(${TARGET} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel} ${high_level_kernels})
nv_library(${TARGET}${target_suffix} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel})
elseif (WITH_ROCM)
hip_library(${TARGET} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel} ${high_level_kernels})
hip_library(${TARGET}${target_suffix} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel})
elseif (WITH_XPU_KP)
xpu_library(${TARGET} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel} ${high_level_kernels})
xpu_library(${TARGET}${target_suffix} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel})
else()
cc_library(${TARGET} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel} ${high_level_kernels})
cc_library(${TARGET}${target_suffix} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel})
endif()
else()
set(target_build_flag 0)
......@@ -290,10 +293,10 @@ function(kernel_library TARGET)
if (${target_build_flag} EQUAL 1)
if (${common_srcs_len} GREATER 0 OR ${cpu_srcs_len} GREATER 0 OR
${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0 OR
${gpudnn_srcs_len} GREATER 0 OR ${selected_rows_srcs_len} GREATER 0)
${gpudnn_srcs_len} GREATER 0)
# append target into PHI_KERNELS property
get_property(phi_kernels GLOBAL PROPERTY PHI_KERNELS)
set(phi_kernels ${phi_kernels} ${TARGET})
set(phi_kernels ${phi_kernels} ${TARGET}${target_suffix})
set_property(GLOBAL PROPERTY PHI_KERNELS ${phi_kernels})
endif()
......@@ -318,9 +321,6 @@ function(kernel_library TARGET)
if (${kps_srcs_len} GREATER 0)
kernel_declare(${kps_srcs})
endif()
if (${selected_rows_srcs_len} GREATER 0)
kernel_declare(${selected_rows_srcs})
endif()
endif()
endfunction()
......
......@@ -219,13 +219,13 @@ message GraphParameter {
optional string gpups_graph_sample_class = 3
[ default = "CompleteGraphSampler" ];
optional string gpups_graph_sample_args = 4 [ default = "" ];
optional bool use_cache = 5 [ default = true ];
optional float cache_ratio = 6 [ default = 0.3 ];
optional bool use_cache = 5 [ default = false ];
optional int32 cache_size_limit = 6 [ default = 100000 ];
optional int32 cache_ttl = 7 [ default = 5 ];
optional GraphFeature graph_feature = 8;
optional string table_name = 9 [ default = "" ];
optional string table_type = 10 [ default = "" ];
optional int32 gpups_mode_shard_num = 11 [ default = 127 ];
optional int32 shard_num = 11 [ default = 127 ];
optional int32 gpu_num = 12 [ default = 1 ];
}
......
......@@ -138,7 +138,6 @@ int BasicBfsGraphSampler::run_graph_sampling() {
int init_size = 0;
//__sync_fetch_and_add
std::function<int(int, int64_t)> bfs = [&, this](int i, int id) -> int {
VLOG(0) << "in bfs " << i << " " << id;
if (this->status == GraphSamplerStatus::terminating) {
int task_left = __sync_sub_and_fetch(&task_size, 1);
if (task_left == 0) {
......@@ -148,13 +147,13 @@ int BasicBfsGraphSampler::run_graph_sampling() {
}
size_t ind = i % this->graph_table->task_pool_size_;
if (nodes_left[i] > 0) {
nodes_left[i]--;
auto iter = sample_neighbors_map[ind].find(id);
if (iter == sample_neighbors_map[ind].end()) {
sample_neighbors_map[ind][id] = std::vector<int64_t>();
iter = sample_neighbors_map[ind].find(id);
Node *node = graph_table->shards[i]->find_node(id);
if (node != NULL) {
nodes_left[i]--;
sample_neighbors_map[ind][id] = std::vector<int64_t>();
iter = sample_neighbors_map[ind].find(id);
size_t edge_fetch_size =
std::min((size_t) this->edge_num_for_each_node,
node->get_neighbor_size());
......@@ -179,11 +178,14 @@ int BasicBfsGraphSampler::run_graph_sampling() {
for (size_t i = 0; i < graph_table->shards.size(); ++i) {
std::vector<Node *> &v = graph_table->shards[i]->get_bucket();
if (v.size() > 0) {
int search_size = std::min(init_search_size, (int)v.size());
for (int k = 0; k < search_size; k++) {
init_size++;
__sync_add_and_fetch(&task_size, 1);
int64_t id = v[0]->get_id();
int64_t id = v[k]->get_id();
graph_table->_shards_task_pool[i % graph_table->task_pool_size_]
->enqueue(bfs, i, id);
}
} // if
}
if (init_size == 0) {
......@@ -301,10 +303,11 @@ void BasicBfsGraphSampler::init(size_t gpu_num, GraphTable *graph_table,
std::vector<std::string> args) {
this->gpu_num = gpu_num;
this->graph_table = graph_table;
node_num_for_each_shard = args.size() > 0 ? std::stoi(args[0]) : 10;
edge_num_for_each_node = args.size() > 1 ? std::stoi(args[1]) : 10;
rounds = args.size() > 2 ? std::stoi(args[2]) : 1;
interval = args.size() > 3 ? std::stoi(args[3]) : 60;
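// args layout (comma-separated gpups_graph_sample_args):
// [init_search_size, node_num_for_each_shard, edge_num_for_each_node, rounds, interval]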
init_search_size = args.size() > 0 ? std::stoi(args[0]) : 10;
node_num_for_each_shard = args.size() > 1 ? std::stoi(args[1]) : 10;
edge_num_for_each_node = args.size() > 2 ? std::stoi(args[2]) : 10;
rounds = args.size() > 3 ? std::stoi(args[3]) : 1;
interval = args.size() > 4 ? std::stoi(args[4]) : 60;
}
#endif
......@@ -1092,11 +1095,6 @@ int32_t GraphTable::initialize(const GraphParameter &graph) {
#ifdef PADDLE_WITH_HETERPS
if (graph.gpups_mode()) {
gpups_mode = true;
if (shard_num == 0) {
shard_num = graph.gpups_mode_shard_num();
server_num = 1;
_shard_idx = 0;
}
auto *sampler =
CREATE_PSCORE_CLASS(GraphSampler, graph.gpups_graph_sample_class());
auto slices =
......@@ -1107,7 +1105,18 @@ int32_t GraphTable::initialize(const GraphParameter &graph) {
graph_sampler.reset(sampler);
}
#endif
if (shard_num == 0) {
server_num = 1;
_shard_idx = 0;
shard_num = graph.shard_num();
}
task_pool_size_ = graph.task_pool_size();
use_cache = graph.use_cache();
if (use_cache) {
cache_size_limit = graph.cache_size_limit();
cache_ttl = graph.cache_ttl();
make_neighbor_sample_cache((size_t)cache_size_limit, (size_t)cache_ttl);
}
_shards_task_pool.resize(task_pool_size_);
for (size_t i = 0; i < _shards_task_pool.size(); ++i) {
_shards_task_pool[i].reset(new ::ThreadPool(1));
......
......@@ -547,6 +547,8 @@ class GraphTable : public SparseTable {
std::unordered_set<int64_t> extra_nodes;
std::unordered_map<int64_t, size_t> extra_nodes_to_thread_index;
bool use_cache, use_duplicate_nodes;
int cache_size_limit;
int cache_ttl;
mutable std::mutex mutex_;
std::shared_ptr<pthread_rwlock_t> rw_lock;
#ifdef PADDLE_WITH_HETERPS
......@@ -593,7 +595,7 @@ class BasicBfsGraphSampler : public GraphSampler {
std::vector<std::vector<paddle::framework::GpuPsGraphNode>> sample_nodes;
std::vector<std::vector<int64_t>> sample_neighbors;
size_t gpu_num;
int node_num_for_each_shard, edge_num_for_each_node;
int init_search_size, node_num_for_each_shard, edge_num_for_each_node;
int rounds, interval;
std::vector<std::unordered_map<int64_t, std::vector<int64_t>>>
sample_neighbors_map;
......
......@@ -456,7 +456,7 @@ void RunBrpcPushSparse() {
pull_status.wait();
ASSERT_EQ(_vs[0].size(), vs1[0].size());
for (int j = 0; j < _vs[0].size(); j++) {
for (size_t j = 0; j < _vs[0].size(); j++) {
ASSERT_EQ(_vs[0][j], vs1[0][j]);
}
}
......
......@@ -86,7 +86,7 @@ void testGraphSample() {
#ifdef PADDLE_WITH_HETERPS
::paddle::distributed::GraphParameter table_proto;
table_proto.set_gpups_mode(true);
table_proto.set_gpups_mode_shard_num(127);
table_proto.set_shard_num(127);
table_proto.set_gpu_num(2);
distributed::GraphTable graph_table, graph_table1;
......@@ -113,7 +113,7 @@ void testGraphSample() {
::paddle::distributed::GraphParameter table_proto1;
table_proto1.set_gpups_mode(true);
table_proto1.set_gpups_mode_shard_num(127);
table_proto1.set_shard_num(127);
table_proto1.set_gpu_num(2);
table_proto1.set_gpups_graph_sample_class("BasicBfsGraphSampler");
table_proto1.set_gpups_graph_sample_args("5,5,1,1");
......
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import yaml
import re
import argparse
import os
########################
### Global Variables ###
########################
ops_to_fill_zero_for_empty_grads = set(["split"])
# For API dispatch used at python-level
# { op_name : [arg_name, ...] }
core_ops_returns_info = {}
core_ops_args_info = {}
core_ops_args_type_info = {}
yaml_types_mapping = {
'int' : 'int', 'int32' : 'int32_t', 'int64' : 'int64_t', 'size_t' : 'size_t', \
'float' : 'float', 'double' : 'double', 'bool' : 'bool', \
'str' : 'std::string', \
'Place' : 'paddle::experimental::Place', 'DataLayout' : 'paddle::experimental::DataLayout', 'DataType' : 'paddle::experimental::DataType', \
'int64[]' : 'std::vector<int64_t>', 'int[]' : 'std::vector<int>',
'Tensor' : 'Tensor',
'Tensor[]' : 'std::vector<Tensor>',
'Tensor[Tensor[]]' : 'std::vector<std::vector<Tensor>>',
'Scalar' : 'paddle::experimental::Scalar',
'ScalarArray' : 'paddle::experimental::ScalarArray'
}
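# Editorial note (not part of the original file): a quick illustration of the
# type translation performed through yaml_types_mapping.
assert yaml_types_mapping['Tensor[]'] == 'std::vector<Tensor>'
assert yaml_types_mapping['ScalarArray'] == 'paddle::experimental::ScalarArray'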
#############################
### File Reader Helpers ###
#############################
def ReadFwdFile(filepath):
f = open(filepath, 'r')
contents = yaml.load(f, Loader=yaml.FullLoader)
f.close()
return contents
def ReadBwdFile(filepath):
f = open(filepath, 'r')
contents = yaml.load(f, Loader=yaml.FullLoader)
ret = {}
for content in contents:
if 'backward_api' in content.keys():
api_name = content['backward_api']
else:
assert False
ret[api_name] = content
f.close()
return ret
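# Editorial usage sketch (not part of the original file; the paths below are
# hypothetical): ReadFwdFile yields a list of api entries, ReadBwdFile a dict
# keyed by each entry's 'backward_api' name.
#   fwd_api_list = ReadFwdFile("api.yaml")
#   bwd_api_dict = ReadBwdFile("backward.yaml")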
##################################
### Generic Helper Functions ###
##################################
def FindGradName(string):
return string + "_grad"
def FindForwardName(string):
if not string.endswith("_grad"):
return None
return string[:-5]
def IsPlainTensorType(string):
plain_tensor_types = ['Tensor&', 'Tensor', 'const Tensor&', 'const Tensor']
if string in plain_tensor_types:
return True
return False
def IsVectorTensorType(string):
vector_tensor_types = [
'std::vector<std::vector<Tensor>>', 'std::vector<Tensor>'
]
if string in vector_tensor_types:
return True
return False
def GetSavedName(string):
return string + "_"
def GetConstReference(string):
ret = string
if not string.startswith("const "):
ret = "const " + string
if not string.endswith("&"):
ret += "&"
return ret
def RemoveConstAndReference(string):
ret = string
if string.startswith("const "):
ret = ret[6:]
if string.endswith("&"):
ret = ret[:-1]
return ret
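# Editorial example (not part of the original file): the two helpers above are
# inverses for simple types.
assert GetConstReference("Tensor") == "const Tensor&"
assert RemoveConstAndReference("const Tensor&") == "Tensor"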
def GetGradNodeName(string):
return f"FinalGradNode{string}"
def GetDygraphForwardFunctionName(string):
return f"{string}_final_state_dygraph_function"
def GetIntermediateAPIFunctionName(string):
return string + "_intermediate"
def GetAutoGradMetaName(string):
return f"{string}_autograd_meta"
def GetAutoGradMetaVectorName(string):
return f"{string}_autograd_meta_vec"
def RemoveSpecialSymbolsInName(string):
# Remove any name after '@'
ret = string.split("@")[0]
return ret
def RecoverBaseNameOfInplaceFunction(function_name):
return function_name[:-1]
def GetInplacedFunctionName(function_name):
return function_name + "_"
def GetForwardFunctionName(string):
return f"{string}_final_state_dygraph_function"
######################
### Yaml Parsers ###
######################
def ParseYamlArgs(string):
# Example: const Tensor& x, const Tensor& y, bool transpose_x, bool transpose_y
# inputs_list = [ [arg_name, arg_type, orig_position], ...]
inputs_list = []
# attrs_list = [ [arg_name, arg_type, default_value, orig_position], ...]
attrs_list = []
args = [x.strip() for x in string.strip().split(",")]
atype = r'((const )?\S+) '
aname = r'(.*)'
pattern = f'{atype}{aname}'
for i in range(len(args)):
arg = args[i]
m = re.search(pattern, arg)
arg_type = m.group(1).strip()
arg_name = m.group(3).split("=")[0].strip()
default_value = m.group(3).split("=")[1].strip() if len(
m.group(3).split("=")) > 1 else None
assert arg_type in yaml_types_mapping.keys(
), f"The argument type {arg_type} in yaml config is not supported in yaml_types_mapping."
arg_type = yaml_types_mapping[arg_type]
arg_name = RemoveSpecialSymbolsInName(arg_name)
if "Tensor" in arg_type:
assert default_value is None
inputs_list.append([arg_name, arg_type, i])
else:
attrs_list.append([arg_name, arg_type, default_value, i])
return inputs_list, attrs_list
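# Editorial example (not part of the original file): splitting a hypothetical
# signature string into tensor inputs and non-tensor attributes.
_ex_inputs, _ex_attrs = ParseYamlArgs("Tensor x, Tensor y, bool transpose_x = false")
assert _ex_inputs == [['x', 'Tensor', 0], ['y', 'Tensor', 1]]
assert _ex_attrs == [['transpose_x', 'bool', 'false', 2]]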
def ParseYamlReturns(string):
# Example0: Tensor(out), Tensor(out1)
# Example1: Tensor, Tensor
# Example2: Tensor[](out), Tensor
# list = [ [ret_name, ret_type, orig_position], ...]
returns_list = []
returns = [x.strip() for x in string.strip().split(",")]
for i in range(len(returns)):
ret = returns[i]
ret_name = ""
if "(" in ret and ")" in ret:
# Remove trailing ')'
ret = ret[:-1]
ret_type = ret.split("(")[0].strip()
ret_name = ret.split("(")[1].strip()
else:
ret_type = ret.strip()
assert ret_type in yaml_types_mapping.keys(
), f"The return type {ret_type} in yaml config is not supported in yaml_types_mapping."
ret_type = yaml_types_mapping[ret_type]
assert "Tensor" in ret_type
ret_name = RemoveSpecialSymbolsInName(ret_name)
returns_list.append([ret_name, ret_type, i])
return returns_list
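# Editorial example (not part of the original file): named, typed returns keep
# their original positions.
_ex_returns = ParseYamlReturns("Tensor(out), Tensor(indices)")
assert _ex_returns == [['out', 'Tensor', 0], ['indices', 'Tensor', 1]]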
def ParseYamlForwardFromBackward(string):
# Example: matmul (const Tensor& x, const Tensor& y, bool transpose_x, bool transpose_y) -> Tensor(out)
fname = r'(.*?)'
wspace = r'\s*'
fargs = r'(.*?)'
frets = r'(.*)'
pattern = f'{fname}{wspace}\({wspace}{fargs}{wspace}\){wspace}->{wspace}{frets}'
m = re.search(pattern, string)
function_name = m.group(1)
function_args = m.group(2)
function_returns = m.group(3)
forward_inputs_list, forward_attrs_list = ParseYamlArgs(function_args)
forward_returns_list = ParseYamlReturns(function_returns)
return forward_inputs_list, forward_attrs_list, forward_returns_list
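# Editorial example (not part of the original file): recovering the forward
# signature recorded in a backward entry's 'forward' field.
_ex_fwd = ParseYamlForwardFromBackward("matmul (Tensor x, Tensor y, bool transpose_x, bool transpose_y) -> Tensor(out)")
_ex_fwd_inputs, _ex_fwd_attrs, _ex_fwd_returns = _ex_fwd
assert _ex_fwd_inputs == [['x', 'Tensor', 0], ['y', 'Tensor', 1]]
assert _ex_fwd_attrs == [['transpose_x', 'bool', None, 2], ['transpose_y', 'bool', None, 3]]
assert _ex_fwd_returns == [['out', 'Tensor', 0]]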
def ParseYamlForward(args_str, returns_str):
# args Example: (const Tensor& x, const Tensor& y, bool transpose_x = false, bool transpose_y = false)
# returns Example: Tensor, Tensor
fargs = r'(.*?)'
wspace = r'\s*'
args_pattern = f'\({fargs}\)'
args_str = re.search(args_pattern, args_str).group(1)
inputs_list, attrs_list = ParseYamlArgs(args_str)
returns_list = ParseYamlReturns(returns_str)
return inputs_list, attrs_list, returns_list
def ParseYamlBackward(args_str, returns_str):
# args Example: (const Tensor& x, const Tensor& y, const Tensor& out_grad, bool transpose_x=false, bool transpose_y=false)
# returns Example: Tensor(x_grad), Tensor(y_grad)
fargs = r'(.*?)'
wspace = r'\s*'
args_pattern = f'\({fargs}\)'
args_str = re.search(args_pattern, args_str).group(1)
inputs_list, attrs_list = ParseYamlArgs(args_str)
returns_list = ParseYamlReturns(returns_str)
return inputs_list, attrs_list, returns_list
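# Editorial example (not part of the original file): ParseYamlForward (and the
# identical ParseYamlBackward) strip the surrounding parentheses before parsing.
_ex_f_inputs, _ex_f_attrs, _ex_f_returns = ParseYamlForward("(Tensor x, float scale = 1.0)", "Tensor(out)")
assert _ex_f_inputs == [['x', 'Tensor', 0]]
assert _ex_f_attrs == [['scale', 'float', '1.0', 1]]
assert _ex_f_returns == [['out', 'Tensor', 0]]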
########################
### Generator Base ###
########################
class FunctionGeneratorBase:
def __init__(self, forward_api_contents, namespace):
self.forward_api_contents = forward_api_contents
self.namespace = namespace
self.forward_api_name = ""
self.orig_forward_inputs_list = [
] #[ [arg_name, arg_type, orig_position], ...]
self.orig_forward_attrs_list = [
] #[ [attr_name, attr_type, default_value, orig_position], ...]
self.orig_forward_returns_list = [
] #[ [ret_name, ret_type, orig_position], ...]
# Processed Forward Data
self.forward_inputs_position_map = {
} #{ "name" : [type, fwd_position] }
self.forward_outputs_position_map = {
} #{ "name" : [type, fwd_position] }
# Special Op Attributes
self.optional_inputs = [] #[name, ...]
self.no_need_buffers = [] #[name, ...]
self.intermediate_outputs = [] #[name, ...]
self.inplace_map = {} #{name : name, ...}
def ParseInplaceInfo(self):
forward_api_contents = self.forward_api_contents
if 'inplace' not in forward_api_contents.keys(): return
# inplace_map_str: "(x -> out0), (y -> out2)"
inplace_map_str = forward_api_contents['inplace']
for pair in inplace_map_str.split(","):
pair = pair.strip()
if pair.startswith("("):
pair = pair[1:]
if pair.endswith(")"):
pair = pair[:-1]
key = pair.split("->")[0].strip()
val = pair.split("->")[1].strip()
self.inplace_map[key] = val
def ParseNoNeedBuffer(self):
forward_api_contents = self.forward_api_contents
if 'no_need_buffer' in forward_api_contents.keys():
no_need_buffer_str = forward_api_contents['no_need_buffer']
for name in no_need_buffer_str.split(","):
name = name.strip()
name = RemoveSpecialSymbolsInName(name)
self.no_need_buffers.append(name.strip())
def ParseDispensable(self):
forward_api_contents = self.forward_api_contents
if 'optional' in forward_api_contents.keys():
optional_inputs_str = forward_api_contents['optional']
for name in optional_inputs_str.split(","):
name = name.strip()
name = RemoveSpecialSymbolsInName(name)
self.optional_inputs.append(name)
def ParseIntermediate(self):
forward_api_contents = self.forward_api_contents
if 'intermediate' in forward_api_contents.keys():
intermediate_str = forward_api_contents['intermediate']
for name in intermediate_str.split(","):
name = name.strip()
name = RemoveSpecialSymbolsInName(name)
self.intermediate_outputs.append(name)
def CollectOriginalForwardInfo(self):
forward_api_contents = self.forward_api_contents
self.forward_api_name = forward_api_contents['api']
forward_args_str = forward_api_contents['args']
forward_returns_str = forward_api_contents['output']
assert 'api' in forward_api_contents.keys(
), "Unable to find \"api\" in forward_api_contents keys"
assert 'args' in forward_api_contents.keys(
), "Unable to find \"args\" in forward_api_contents keys"
assert 'output' in forward_api_contents.keys(
), "Unable to find \"output\" in forward_api_contents keys"
# Collect Original Forward Inputs/Outputs and then perform validation checks
self.orig_forward_inputs_list, self.orig_forward_attrs_list, self.orig_forward_returns_list = ParseYamlForward(
forward_args_str, forward_returns_str)
def DetermineForwardPositionMap(self, forward_inputs_list,
forward_returns_list):
for i in range(len(forward_inputs_list)):
forward_input = forward_inputs_list[i]
input_name = forward_input[0]
input_type = forward_input[1]
input_pos = forward_input[2]
self.forward_inputs_position_map[
input_name] = [input_type, input_pos]
for i in range(len(forward_returns_list)):
forward_return = forward_returns_list[i]
return_name = forward_return[0]
return_type = forward_return[1]
return_pos = forward_return[2]
self.forward_outputs_position_map[
return_name] = [return_type, return_pos]
print("Generated Forward Input Position Map: ",
self.forward_inputs_position_map)
print("Generated Forward Output Position Map: ",
self.forward_outputs_position_map)
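# Editorial usage sketch (not part of the original file; the yaml contents are
# hypothetical):
#   _contents = {'api': 'matmul', 'args': '(Tensor x, Tensor y, bool transpose_x = false)', 'output': 'Tensor(out)'}
#   _gen = FunctionGeneratorBase(_contents, namespace="")
#   _gen.CollectOriginalForwardInfo()
#   _gen.DetermineForwardPositionMap(_gen.orig_forward_inputs_list, _gen.orig_forward_returns_list)
#   _gen.forward_inputs_position_map   -> {'x': ['Tensor', 0], 'y': ['Tensor', 1]}
#   _gen.forward_outputs_position_map  -> {'out': ['Tensor', 0]}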
class YamlGeneratorBase:
def __init__(self, api_yaml_path):
self.namespace = ""
self.api_yaml_path = api_yaml_path
self.forward_api_list = []
def ParseForwardYamlContents(self):
api_yaml_path = self.api_yaml_path
self.forward_api_list = ReadFwdFile(api_yaml_path)
def InferNameSpace(self):
api_yaml_path = self.api_yaml_path
if "sparse" in api_yaml_path:
self.namespace = "sparse::"
......@@ -16,31 +16,25 @@ import yaml
import re
import argparse
import os
ops_to_fill_zero_for_empty_grads = set(list("split"))
# For API dispatch used at python-level
# { op_name : [arg_name, ...] }
core_ops_returns_info = {}
core_ops_args_info = {}
core_ops_args_type_info = {}
namespace = ""
yaml_types_mapping = {
'int' : 'int', 'int32' : 'int32_t', 'int64' : 'int64_t', 'size_t' : 'size_t', \
'float' : 'float', 'double' : 'double', 'bool' : 'bool', \
'str' : 'std::string', \
'Place' : 'paddle::experimental::Place', 'DataLayout' : 'paddle::experimental::DataLayout', 'DataType' : 'paddle::experimental::DataType', \
'int64[]' : 'std::vector<int64_t>', 'int[]' : 'std::vector<int>',
'Tensor' : 'Tensor',
'Tensor[]' : 'std::vector<Tensor>',
'Tensor[Tensor[]]' : 'std::vector<std::vector<Tensor>>',
'Scalar' : 'paddle::experimental::Scalar',
'ScalarArray' : 'paddle::experimental::ScalarArray'
}
from codegen_utils import core_ops_returns_info, core_ops_args_info, core_ops_args_type_info
from codegen_utils import yaml_types_mapping
from codegen_utils import ReadFwdFile, ReadBwdFile
from codegen_utils import FindGradName, FindForwardName, GetSavedName, GetGradNodeName
from codegen_utils import IsPlainTensorType, IsVectorTensorType
from codegen_utils import GetConstReference, RemoveConstAndReference
from codegen_utils import GetDygraphForwardFunctionName, GetIntermediateAPIFunctionName
from codegen_utils import GetAutoGradMetaName, GetAutoGradMetaVectorName
from codegen_utils import RemoveSpecialSymbolsInName, RecoverBaseNameOfInplaceFunction
from codegen_utils import GetInplacedFunctionName
from codegen_utils import ParseYamlArgs, ParseYamlReturns, ParseYamlForwardFromBackward
from codegen_utils import ParseYamlForward, ParseYamlBackward
from codegen_utils import FunctionGeneratorBase, YamlGeneratorBase
from codegen_utils import ops_to_fill_zero_for_empty_grads
###########
## Utils ##
###########
def ParseArguments():
parser = argparse.ArgumentParser(
description='Eager Code Generator Args Parser')
......@@ -55,295 +49,373 @@ def ParseArguments():
return args
#################
### Helpers ###
#################
def RecoverBaseNameOfInplaceFunction(function_name):
return function_name[:-1]
def GetInplacedFunctionName(function_name):
return function_name + "_"
def FindGradName(string):
return string + "_grad"
def FindForwardName(string):
if not string.endswith("_grad"):
return None
return string[:-5]
def IsPlainTensorType(string):
plain_tensor_types = ['Tensor&', 'Tensor', 'const Tensor&', 'const Tensor']
if string in plain_tensor_types:
return True
return False
def IsVectorTensorType(string):
vector_tensor_types = [
'std::vector<std::vector<Tensor>>', 'std::vector<Tensor>'
]
if string in vector_tensor_types:
return True
return False
def GetSavedName(string):
return string + "_"
def GetConstReference(string):
ret = string
if not string.startswith("const "):
ret = "const " + string
if not string.endswith("&"):
ret += "&"
return ret
def RemoveConstAndReference(string):
ret = string
if string.startswith("const "):
ret = ret[6:]
if string.endswith("&"):
ret = ret[:-1]
return ret
def GetGradNodeName(string):
return f"FinalGradNode{string}"
def GetForwardFunctionName(string):
return f"{string}_final_state_dygraph_function"
def GetAutoGradMetaName(string):
return f"{string}_autograd_meta"
def GetAutoGradMetaVectorName(string):
return f"{string}_autograd_meta_vec"
######################
### File Readers ###
######################
def ReadFwdFile(filepath):
f = open(filepath, 'r')
contents = yaml.load(f, Loader=yaml.FullLoader)
f.close()
return contents
def ReadBwdFile(filepath):
f = open(filepath, 'r')
contents = yaml.load(f, Loader=yaml.FullLoader)
ret = {}
for content in contents:
if 'backward_api' in content.keys():
api_name = content['backward_api']
else:
assert False
########################
## Code Gen Templates ##
########################
SET_PLAIN_TENSOR_WRAPPER_TEMPLATE = \
"""
void SetTensorWrapper{}(const paddle::experimental::Tensor& {}, bool full_reserved) {{
{} = egr::TensorWrapper({}, full_reserved, {});
}}
"""
ret[api_name] = content
f.close()
return ret
PLAIN_TENSOR_MEMBER_TEMPLATE = \
"""
egr::TensorWrapper {};
"""
CLEAR_TENSOR_WRAPPER_TEMPLATE = \
"""
{}.clear();
"""
######################
### Yaml Parsers ###
######################
def ParseInplaceInfo(string):
# string: "(x -> out0), (y -> out2)"
inplace_map = {}
for pair in string.split(","):
pair = pair.strip()
if pair.startswith("("):
pair = pair[1:]
SET_VECTOR_TENSOR_WRAPPER_TEMPLATE = \
"""
void SetTensorWrapper{}(const std::vector<paddle::experimental::Tensor>& {}, bool full_reserved) {{
for(const auto& eager_tensor : {}) {{
{}.emplace_back( egr::TensorWrapper(eager_tensor, full_reserved, {}) );
}};
}}
"""
if pair.endswith(")"):
pair = pair[:-1]
VECTOR_TENSOR_MEMBER_TEMPLATE = \
"""
std::vector<egr::TensorWrapper> {};
"""
key = pair.split("->")[0].strip()
val = pair.split("->")[1].strip()
inplace_map[key] = val
CLEAR_VECTOR_TENSOR_WRAPPERS_TEMPLATE = \
"""
for (auto tw: {}) {{
tw.clear();
}};
"""
return inplace_map
SET_ATTR_METHOD_TEMPLATE = \
"""
void SetAttribute{}({} {}) {{
{} = {};
}}
"""
ATTRIBUTE_MEMBER_WITH_DEFAULT_TEMPLATE = \
"""
{} {} = {};
"""
def RemoveSpecialSymbolsInName(string):
# Remove any name after '@'
ret = string.split("@")[0]
return ret
ATTRIBUTE_MEMBER_TEMPLATE = \
"""
{} {};
"""
NODE_DECLARATION_TEMPLATE = \
"""
class {} : public egr::GradNodeBase {{
public:
{}() : egr::GradNodeBase() {{}}
{}(size_t bwd_in_slot_num, size_t bwd_out_slot_num) :
egr::GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) {{}}
~{}() override = default;
def IntermediateValidationCheck(intermediate_outputs, forward_returns_list):
# intermediate_outputs : [name0, name1, ...]
# forward_returns_list : [[ret_name, type, orig_pos], ...]
"""
Check whether intermediate_outputs are positioned
at the very end of forward_returns_list
"""
virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()(
std::vector<std::vector<paddle::experimental::Tensor>>& grads, bool create_graph = false) override;
std::string name() override {{ return \" {} \"; }}
intermediate_positions = range(
len(forward_returns_list) - len(intermediate_outputs),
len(forward_returns_list))
for ret_name, _, pos in forward_returns_list:
if ret_name in intermediate_outputs:
assert pos in intermediate_positions
void ClearTensorWrappers() override {{
{}
is_tensor_wrappers_cleared = true;
}}
// SetTensorWrapperX, SetTensorWrapperY, ...
{}
// SetAttributes
{}
def ParseDispensable(string):
# string: "X, Y"
string = RemoveSpecialSymbolsInName(string)
return [v.strip() for v in string.split(",")]
bool IsTensorWrappersCleared() override {{
return is_tensor_wrappers_cleared;
}}
private:
// TensorWrappers
{}
bool is_tensor_wrappers_cleared = false;
def ParseIntermediate(string):
string = RemoveSpecialSymbolsInName(string)
return [v.strip() for v in string.split(",")]
// Attributes
{}
}};
"""
FUNCTION_TEMPLATE = \
"""
std::vector<std::vector<paddle::experimental::Tensor>> {}::operator()(std::vector<std::vector<paddle::experimental::Tensor>>& grads, bool create_graph) {{
{}
auto hooked_grads = ApplyGradientHooks(grads);
def ParseNoNeedBuffer(string):
# string: "x, y"
string = RemoveSpecialSymbolsInName(string)
// Call grad_api function
VLOG(3) << \"Final State Running: \" << \"{}\";
auto grad_api_returns = {}{}({});
{}
}}
"""
no_need_buffer_set = set()
for name in string.split(","):
no_need_buffer_set.add(name.strip())
FORWARD_FUNCTION_TEMPLATE = \
"""
{} {}({}) {{
{}
return no_need_buffer_set
{}
// Returns
return {};
}}
def ParseYamlArgs(string):
# Example: const Tensor& x, const Tensor& y, bool transpose_x, bool transpose_y
"""
# inputs_list = [ [arg_name, arg_type, orig_position], ...]
inputs_list = []
# attrs_list = [ [arg_name, arg_type, default_value, orig_position], ...]
attrs_list = []
NODE_CREATION_TEMPLATE = \
"""
// Get AutoGradMeta
{}
bool trace_backward = egr::Controller::Instance().HasGrad();
bool require_any_grad = egr::EagerUtils::ComputeRequireGrad({});
{}
// Forward API Call
{}
{}
{{
{}
{}
if(require_any_grad) {{
egr::EagerUtils::PassStopGradient({});
args = [x.strip() for x in string.strip().split(",")]
atype = r'((const )?\S+) '
aname = r'(.*)'
pattern = f'{atype}{aname}'
for i in range(len(args)):
arg = args[i]
m = re.search(pattern, arg)
arg_type = m.group(1).strip()
arg_name = m.group(3).split("=")[0].strip()
default_value = m.group(3).split("=")[1].strip() if len(
m.group(3).split("=")) > 1 else None
// Node Construction
{}
// SetAttributes
{}
// SetTensorWrappers
{}
// SetGradOutMeta & SetEdges
{}
{}
// SetOutRank & SetHistory & SetGradInMeta & RetainGrad
{}
{}
{}
{}
}}
}}
"""
assert arg_type in yaml_types_mapping.keys(
), f"The argument type {arg_type} in yaml config is not supported in yaml_types_mapping."
arg_type = yaml_types_mapping[arg_type]
NAMESPACE_WRAPPER_TEMPLATE = \
"""
namespace {} {{
{}
}}
"""
arg_name = RemoveSpecialSymbolsInName(arg_name)
if "Tensor" in arg_type:
assert default_value is None
inputs_list.append([arg_name, arg_type, i])
else:
attrs_list.append([arg_name, arg_type, default_value, i])
NODE_CC_FILE_TEMPLATE = \
"""
#include "glog/logging.h"
#include "paddle/phi/api/all.h"
#include "paddle/phi/api/backward/backward_api.h"
#include "paddle/phi/api/backward/sparse_bw_api.h"
#include "paddle/fluid/imperative/tracer.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/eager/utils.h"
#include "paddle/fluid/eager/api/utils/global_utils.h"
#include "paddle/fluid/eager/api/generated/eager_generated/backwards/nodes.h"
#include "paddle/fluid/eager/to_static/run_program_op_node.h"
return inputs_list, attrs_list
#include "paddle/phi/api/include/sparse_api.h"
{}
"""
def ParseYamlReturns(string):
# Example0: Tensor(out), Tensor(out1)
# Example1: Tensor, Tensor
# Example2: Tensor[](out), Tensor
NODE_H_FILE_TEMPLATE = \
"""
#pragma once
#include "paddle/fluid/eager/tensor_wrapper.h"
#include "paddle/fluid/eager/grad_node_info.h"
# list = [ [ret_name, ret_type, orig_position], ...]
returns_list = []
{}
"""
returns = [x.strip() for x in string.strip().split(",")]
FORWARD_CC_FILE_TEMPLATE = \
"""
#include "paddle/phi/api/lib/dygraph_api.h"
#include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h"
#include "paddle/fluid/eager/api/generated/eager_generated/backwards/nodes.h"
for i in range(len(returns)):
ret = returns[i]
#include "paddle/phi/api/include/sparse_api.h"
#include "paddle/fluid/eager/api/utils/global_utils.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
ret_name = ""
if "(" in ret and ")" in ret:
# Remove trailing ')'
ret = ret[:-1]
ret_type = ret.split("(")[0].strip()
ret_name = ret.split("(")[1].strip()
else:
ret_type = ret.strip()
{}
{}
"""
assert ret_type in yaml_types_mapping.keys(
), f"The return type {ret_type} in yaml config is not supported in yaml_types_mapping."
ret_type = yaml_types_mapping[ret_type]
FORWARD_H_FILE_TEMPLATE = \
"""
#pragma once
#include "glog/logging.h"
#include "paddle/fluid/eager/autograd_meta.h"
#include "paddle/phi/api/all.h"
#include "paddle/fluid/eager/utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/eager/to_static/run_program_op_func.h"
assert "Tensor" in ret_type
ret_name = RemoveSpecialSymbolsInName(ret_name)
returns_list.append([ret_name, ret_type, i])
{}
{}
"""
return returns_list
CORE_OPS_INFO_TEMPLATE = \
"""
std::unordered_map<std::string, std::vector<std::string>> core_ops_final_state_args_info = {{
{}
}};
std::unordered_map<std::string, std::vector<std::string>> core_ops_final_state_args_type_info = {{
{}
}};
std::unordered_map<std::string, std::vector<std::string>> core_ops_final_state_returns_info = {{
{}
}};
"""
def ParseYamlForwardFromBackward(string):
# Example: matmul (const Tensor& x, const Tensor& y, bool transpose_x, bool transpose_y) -> Tensor(out)
CORE_OPS_DECLARATION_TEMPLATE = \
"""
extern std::unordered_map<std::string, std::vector<std::string>> core_ops_final_state_args_info;
extern std::unordered_map<std::string, std::vector<std::string>> core_ops_final_state_args_type_info;
extern std::unordered_map<std::string, std::vector<std::string>> core_ops_final_state_returns_info;
fname = r'(.*?)'
wspace = r'\s*'
fargs = r'(.*?)'
frets = r'(.*)'
pattern = f'{fname}{wspace}\({wspace}{fargs}{wspace}\){wspace}->{wspace}{frets}'
"""
m = re.search(pattern, string)
function_name = m.group(1)
function_args = m.group(2)
function_returns = m.group(3)
CHECK_INPLACE_TEMPLATE = \
"""
// Check Inplace
egr::EagerUtils::CheckInplace({}, {}, require_any_grad);\n
"""
forward_inputs_list, forward_attrs_list = ParseYamlArgs(function_args)
forward_returns_list = ParseYamlReturns(function_returns)
BUMP_INPLACE_VERSION_TEMPLATE = \
"""
// Bump Inplace Version
{}.bump_inplace_version();
VLOG(3) << \"Tensor(\" << {}.name() << \") uses Inplace Strategy.\";\n
"""
return forward_inputs_list, forward_attrs_list, forward_returns_list
#######################
## Generator Helpers ##
#######################
def GenerateCoreOpInfoDeclaration():
return CORE_OPS_DECLARATION_TEMPLATE
def ParseYamlForward(args_str, returns_str):
# args Example: (const Tensor& x, const Tensor& y, bool transpose_x = false, bool transpose_y = false)
# returns Example: Tensor, Tensor
fargs = r'(.*?)'
wspace = r'\s*'
args_pattern = f'\({fargs}\)'
args_str = re.search(args_pattern, args_str).group(1)
def GenerateCoreOpInfoDefinition():
inputs_list, attrs_list = ParseYamlArgs(args_str)
returns_list = ParseYamlReturns(returns_str)
op_args_info_list = []
for op_name, arg_list in core_ops_args_info.items():
arg_str = ",".join(["\"" + v + "\"" for v in arg_list])
op_args_info = f"{{ \"{op_name}\", {{ {arg_str} }} }},"
op_args_info_list.append(op_args_info)
return inputs_list, attrs_list, returns_list
op_types_info_list = []
for op_name, type_list in core_ops_args_type_info.items():
type_str = ",".join(["\"" + v + "\"" for v in type_list])
op_types_info = f"{{ \"{op_name}\", {{ {type_str} }} }},"
op_types_info_list.append(op_types_info)
op_returns_info_list = []
for op_name, return_list in core_ops_returns_info.items():
return_str = ",".join(["\"" + v + "\"" for v in return_list])
return_types_info = f"{{ \"{op_name}\", {{ {return_str} }} }},"
op_returns_info_list.append(return_types_info)
def ParseYamlBackward(args_str, returns_str):
# args Example: (const Tensor& x, const Tensor& y, const Tensor& out_grad, bool transpose_x=false, bool transpose_y=false)
# returns Example: Tensor(x_grad), Tensor(y_grad)
op_args_info_str = "\n".join(op_args_info_list)
op_types_info_str = "\n".join(op_types_info_list)
op_returns_info_str = "\n".join(op_returns_info_list)
fargs = r'(.*?)'
wspace = r'\s*'
args_pattern = f'\({fargs}\)'
args_str = re.search(args_pattern, args_str).group(1)
core_ops_info_definition_str = CORE_OPS_INFO_TEMPLATE.format(
op_args_info_str, op_types_info_str, op_returns_info_str)
inputs_list, attrs_list = ParseYamlArgs(args_str)
returns_list = ParseYamlReturns(returns_str)
return core_ops_info_definition_str
return inputs_list, attrs_list, returns_list
#####################
## Generator Class ##
#####################
class DygraphSingleFunctionGenerator(FunctionGeneratorBase):
def __init__(self, forward_api_contents, grad_api_contents, namespace):
self.forward_api_contents = forward_api_contents
# Members from Parent:
#self.namespace
#self.forward_api_contents
#self.forward_api_name
#self.orig_forward_inputs_list
#self.orig_forward_attrs_list
#self.orig_forward_returns_list
#self.forward_inputs_position_map
#self.forward_outputs_position_map
#self.optional_inputs
#self.no_need_buffers
#self.intermediate_outputs
#self.inplace_map
FunctionGeneratorBase.__init__(self, forward_api_contents, namespace)
self.grad_api_contents = grad_api_contents
# Raw Contents
self.backward_forward_str = ""
self.backward_api_name = ""
self.forward_attrs_list = [
] #[ [attr_name, attr_type, default_value, orig_position], ...]
self.forward_inputs_list = [
] #[ [arg_name, arg_type, orig_position], ...]
self.forward_returns_list = [
] #[ [ret_name, ret_type, orig_position], ...]
self.backward_inputs_list = [
] #[ [attr_name, attr_type, default_value, orig_position], ...]
self.backward_attrs_list = [
] #[ [arg_name, arg_type, orig_position], ...]
self.backward_returns_list = [
] #[ [ret_name, ret_type, orig_position], ...]
# SlotNameMatched Backward Data
self.backward_forward_inputs_map = {
} #{ "name" : [type, is_fwd_input, orig_position] ...}
self.backward_grad_inputs_map = {
} #{ "name" : [type, fwd_position, orig_position] ...}
self.backward_grad_outputs_map = {
} #{ "name" : [type, fwd_position, orig_position] ...}
# Generated Results
self.forward_definition_str = ""
self.forward_declaration_str = ""
self.node_declaration_str = ""
self.node_definition_str = ""
def DygraphYamlValidationCheck(self):
forward_api_contents = self.forward_api_contents
grad_api_contents = self.grad_api_contents
assert 'api' in forward_api_contents.keys()
assert 'args' in forward_api_contents.keys()
assert 'output' in forward_api_contents.keys()
assert 'backward' in forward_api_contents.keys()
assert 'args' in grad_api_contents.keys()
assert 'output' in grad_api_contents.keys()
assert 'forward' in grad_api_contents.keys()
def ForwardsValidationCheck(self):
forward_inputs_list = self.forward_inputs_list
forward_attrs_list = self.forward_attrs_list
forward_returns_list = self.forward_returns_list
orig_forward_inputs_list = self.orig_forward_inputs_list
orig_forward_attrs_list = self.orig_forward_attrs_list
orig_forward_returns_list = self.orig_forward_returns_list
#######################
### Preprocessing ###
#######################
def ForwardsValidationCheck(forward_inputs_list, forward_attrs_list,
forward_returns_list, orig_forward_inputs_list,
orig_forward_attrs_list, orig_forward_returns_list):
for i in range(len(forward_inputs_list)):
forward_input_name = forward_inputs_list[i][0]
forward_input_type = forward_inputs_list[i][1]
......@@ -387,17 +459,18 @@ def ForwardsValidationCheck(forward_inputs_list, forward_attrs_list,
assert pos > max_input_position
max_attr_position = max(max_attr_position, pos)
def BackwardValidationCheck(backward_fwd_input_map, backward_grad_input_map,
backward_attrs_list):
def BackwardValidationCheck(self):
backward_forward_inputs_map = self.backward_forward_inputs_map
backward_grad_inputs_map = self.backward_grad_inputs_map
backward_attrs_list = self.backward_attrs_list
# Check Order: TensorWrappers, GradTensors, Attributes
max_fwd_input_position = -1
for _, (_, _, pos) in backward_fwd_input_map.items():
for _, (_, _, pos) in backward_forward_inputs_map.items():
max_fwd_input_position = max(max_fwd_input_position, pos)
max_grad_tensor_position = -1
for _, (_, _, pos) in backward_grad_input_map.items():
for _, (_, _, pos) in backward_grad_inputs_map.items():
assert pos > max_fwd_input_position
max_grad_tensor_position = max(max_grad_tensor_position, pos)
......@@ -406,35 +479,48 @@ def BackwardValidationCheck(backward_fwd_input_map, backward_grad_input_map,
assert pos > max_grad_tensor_position
max_attr_position = max(max_attr_position, pos)
def IntermediateValidationCheck(self):
intermediate_outputs = self.intermediate_outputs
forward_returns_list = self.forward_returns_list
"""
Check whether intermediate_outputs are positioned
at the very end of forward_returns_list
"""
intermediate_positions = range(
len(forward_returns_list) - len(intermediate_outputs),
len(forward_returns_list))
for ret_name, _, pos in forward_returns_list:
if ret_name in intermediate_outputs:
assert pos in intermediate_positions
def DetermineForwardPositionMap(forward_inputs_list, forward_returns_list):
forward_inputs_position_map = {}
forward_outputs_position_map = {}
for i in range(len(forward_inputs_list)):
forward_input = forward_inputs_list[i]
input_name = forward_input[0]
input_type = forward_input[1]
input_pos = forward_input[2]
def CollectBackwardInfo(self):
forward_api_contents = self.forward_api_contents
grad_api_contents = self.grad_api_contents
forward_inputs_position_map[input_name] = [input_type, input_pos]
self.backward_api_name = forward_api_contents['backward']
self.backward_forward_str = grad_api_contents['forward']
for i in range(len(forward_returns_list)):
forward_return = forward_returns_list[i]
return_name = forward_return[0]
return_type = forward_return[1]
return_pos = forward_return[2]
backward_args_str = grad_api_contents['args']
backward_returns_str = grad_api_contents['output']
forward_outputs_position_map[return_name] = [return_type, return_pos]
self.backward_inputs_list, self.backward_attrs_list, self.backward_returns_list = ParseYamlBackward(
backward_args_str, backward_returns_str)
print("Parsed Backward Inputs List: ", self.backward_inputs_list)
print("Prased Backward Attrs List: ", self.backward_attrs_list)
print("Parsed Backward Returns List: ", self.backward_returns_list)
return forward_inputs_position_map, forward_outputs_position_map
def CollectForwardInfoFromBackwardContents(self):
backward_forward_str = self.backward_forward_str
def SlotNameMatching(backward_inputs_list, backward_returns_list,
forward_inputs_position_map, forward_outputs_position_map):
self.forward_inputs_list, self.forward_attrs_list, self.forward_returns_list = ParseYamlForwardFromBackward(
backward_forward_str)
backward_fwd_input_map = {}
backward_grad_input_map = {}
backward_grad_output_map = {}
def SlotNameMatching(self):
backward_inputs_list = self.backward_inputs_list
backward_returns_list = self.backward_returns_list
forward_inputs_position_map = self.forward_inputs_position_map
forward_outputs_position_map = self.forward_outputs_position_map
for backward_input in backward_inputs_list:
backward_input_name = backward_input[0]
......@@ -450,7 +536,7 @@ def SlotNameMatching(backward_inputs_list, backward_returns_list,
matched_forward_output_pos = forward_outputs_position_map[
backward_fwd_name][1]
backward_grad_input_map[backward_input_name] = [
self.backward_grad_inputs_map[backward_input_name] = [
backward_input_type, matched_forward_output_pos,
backward_input_pos
]
......@@ -459,14 +545,14 @@ def SlotNameMatching(backward_inputs_list, backward_returns_list,
if backward_input_name in forward_inputs_position_map.keys():
tensor_wrapper_type = forward_inputs_position_map[
backward_input_name][0]
backward_fwd_input_map[backward_input_name] = [
self.backward_forward_inputs_map[backward_input_name] = [
backward_input_type, True, backward_input_pos
]
elif backward_input_name in forward_outputs_position_map.keys():
tensor_wrapper_type = forward_outputs_position_map[
backward_input_name][0]
backward_fwd_input_map[backward_input_name] = [
self.backward_forward_inputs_map[backward_input_name] = [
backward_input_type, False, backward_input_pos
]
else:
......@@ -480,191 +566,113 @@ def SlotNameMatching(backward_inputs_list, backward_returns_list,
backward_fwd_name = FindForwardName(backward_output_name)
assert backward_fwd_name is not None
assert backward_fwd_name in forward_inputs_position_map.keys(
), backward_fwd_name
), f"Unable to find {backward_fwd_name} in forward inputs"
matched_forward_input_type = forward_inputs_position_map[
backward_fwd_name][0]
matched_forward_input_pos = forward_inputs_position_map[
backward_fwd_name][1]
backward_grad_output_map[backward_output_name] = [
backward_output_type, matched_forward_input_pos, backward_output_pos
self.backward_grad_outputs_map[backward_output_name] = [
backward_output_type, matched_forward_input_pos,
backward_output_pos
]
print("Generated Backward Fwd Input Map: ",
self.backward_forward_inputs_map)
print("Generated Backward Grad Input Map: ",
self.backward_grad_inputs_map)
print("Generated Backward Grad Output Map: ",
self.backward_grad_outputs_map)
return backward_fwd_input_map, backward_grad_input_map, backward_grad_output_map
def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map,
backward_attrs_list, no_need_buffer_set):
# Inputs:
# fwd_api_name = ""
# backward_fwd_input_map = { "name" : [type, is_fwd_input, orig_position] ...}
# backward_attrs_list = [ [attr_name, attr_type, default_value, orig_position], ...]
# Determine Node Name
forward_op_name = fwd_api_name
def GenerateNodeDeclaration(self):
forward_op_name = self.forward_api_name
backward_forward_inputs_map = self.backward_forward_inputs_map
backward_attrs_list = self.backward_attrs_list
no_need_buffers = self.no_need_buffers
# SetTensorWrapper Methods & TensorWrapper Members
set_tensor_wrapper_methods_str = ""
tensor_wrapper_members_str = ""
clear_tensor_wrapper_str = ""
for tname, (ttype, is_fwd_input, _) in backward_fwd_input_map.items():
if tname in no_need_buffer_set:
no_need_buffer = "true"
else:
no_need_buffer = "false"
for tname, (ttype, is_fwd_input,
_) in backward_forward_inputs_map.items():
no_need_buffer = "true" if tname in no_need_buffers else "false"
tensor_wrapper_name = GetSavedName(tname)
if IsPlainTensorType(ttype):
SET_PLAIN_TENSOR_WRAPPER_TEMPLATE = """
void SetTensorWrapper{}(const paddle::experimental::Tensor& {}, bool full_reserved) {{
{} = egr::TensorWrapper({}, full_reserved, {});
}}
"""
set_tensor_wrapper_methods_str += SET_PLAIN_TENSOR_WRAPPER_TEMPLATE.format(
tname, tname, tensor_wrapper_name, tname, no_need_buffer)
PLAIN_TENSOR_MEMBER_TEMPLATE = """
egr::TensorWrapper {};
"""
tensor_wrapper_members_str += PLAIN_TENSOR_MEMBER_TEMPLATE.format(
tensor_wrapper_name)
CLEAR_TENSOR_WRAPPERS_TEMPLATE = """
{}.clear();
"""
clear_tensor_wrapper_str += CLEAR_TENSOR_WRAPPERS_TEMPLATE.format(
clear_tensor_wrapper_str += CLEAR_TENSOR_WRAPPER_TEMPLATE.format(
tensor_wrapper_name)
else:
assert IsVectorTensorType(ttype)
SET_VECTOR_TENSOR_WRAPPER_TEMPLATE = """
void SetTensorWrapper{}(const std::vector<paddle::experimental::Tensor>& {}, bool full_reserved) {{
for(const auto& eager_tensor : {}) {{
{}.emplace_back( egr::TensorWrapper(eager_tensor, full_reserved, {}) );
}};
}}
"""
set_tensor_wrapper_methods_str += SET_VECTOR_TENSOR_WRAPPER_TEMPLATE.format(
tname, tname, tname, tensor_wrapper_name, no_need_buffer)
VECTOR_TENSOR_MEMBER_TEMPLATE = """
std::vector<egr::TensorWrapper> {};
"""
tensor_wrapper_members_str += VECTOR_TENSOR_MEMBER_TEMPLATE.format(
tensor_wrapper_name)
CLEAR_TENSOR_WRAPPERS_TEMPLATE = """
for (auto tw: {}) {
tw.clear();
};
"""
clear_tensor_wrapper_str += CLEAR_TENSOR_WRAPPERS_TEMPLATE.format(
clear_tensor_wrapper_str += CLEAR_VECTOR_TENSOR_WRAPPERS_TEMPLATE.format(
tensor_wrapper_name)
# End: SetTensorWrapper Methods & TensorWrapper Members
# SetAttributes & Attribute Members
set_attribute_methods_str = ""
attribute_members_str = ""
for aname, atype, default_val, _ in backward_attrs_list:
saved_attr_name = GetSavedName(aname)
SET_ATTR_METHOD_TEMPLATE = """
void SetAttribute{}({} {}) {{
{} = {};
}}
"""
set_attribute_methods_str += SET_ATTR_METHOD_TEMPLATE.format(
aname, GetConstReference(atype), aname, saved_attr_name, aname)
if default_val:
ATTRIBUTE_MEMBER_TEMPLATE = """
{} {} = {};
"""
attribute_members_str += ATTRIBUTE_MEMBER_TEMPLATE.format(
RemoveConstAndReference(atype), saved_attr_name, default_val)
attribute_members_str += ATTRIBUTE_MEMBER_WITH_DEFAULT_TEMPLATE.format(
RemoveConstAndReference(atype), saved_attr_name,
default_val)
else:
ATTRIBUTE_MEMBER_TEMPLATE = """
{} {};
"""
attribute_members_str += ATTRIBUTE_MEMBER_TEMPLATE.format(
RemoveConstAndReference(atype), saved_attr_name)
# End: SetAttributes & Attribute Members
grad_node_name = GetGradNodeName(fwd_api_name)
NODE_DECLARATION_TEMPLATE = """
class {} : public egr::GradNodeBase {{
public:
{}() : egr::GradNodeBase() {{}}
{}(size_t bwd_in_slot_num, size_t bwd_out_slot_num) :
egr::GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) {{}}
~{}() override = default;
virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()(
std::vector<std::vector<paddle::experimental::Tensor>>& grads, bool create_graph = false) override;
std::string name() override {{ return \" {} \"; }}
void ClearTensorWrappers() override {{
{}
is_tensor_wrappers_cleared = true;
}}
// SetTensorWrapperX, SetTensorWrapperY, ...
{}
// SetAttributes
{}
bool IsTensorWrappersCleared() override {{
return is_tensor_wrappers_cleared;
}}
private:
// TensorWrappers
{}
bool is_tensor_wrappers_cleared = false;
// Attributes
{}
}};
"""
node_declaration_str = NODE_DECLARATION_TEMPLATE.format(
grad_node_name = GetGradNodeName(forward_op_name)
self.node_declaration_str = NODE_DECLARATION_TEMPLATE.format(
grad_node_name, grad_node_name, grad_node_name, grad_node_name,
grad_node_name, clear_tensor_wrapper_str,
set_tensor_wrapper_methods_str, set_attribute_methods_str,
tensor_wrapper_members_str, attribute_members_str)
return node_declaration_str
print("Generated Node Declaration: ", self.node_declaration_str)
def GenerateNodeDefinition(fwd_api_name, bwd_api_name, backward_fwd_input_map,
backward_grad_input_map, backward_grad_output_map,
backward_attrs_list):
# fwd_api_name = ""
# backward_fwd_input_map = { "name" : [type, is_fwd_input, orig_position] ...}
# backward_grad_input_map = { "name" : [type, fwd_position, orig_position] ...}
# backward_grad_output_map = { "name" : [type, fwd_position, orig_position] ...}
# backward_attrs_list = [ [attr_name, attr_type, default_value, orig_position], ...]
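# Hedged example of the map layouts documented above (illustrative only; the
# entries below are hypothetical matmul-style names, not parsed from any yaml):
_demo_backward_forward_inputs_map = {"x": ["Tensor", True, 0], "y": ["Tensor", True, 1]}
_demo_backward_grad_inputs_map = {"out_grad": ["Tensor", 0, 2]}
_demo_backward_grad_outputs_map = {"x_grad": ["Tensor", 0, 0], "y_grad": ["Tensor", 1, 1]}
_demo_backward_attrs_list = [["transpose_x", "bool", "false", 3]]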
def GenerateNodeDefinition(self):
namespace = self.namespace
forward_api_name = self.forward_api_name
backward_api_name = self.backward_api_name
backward_forward_inputs_map = self.backward_forward_inputs_map
backward_grad_inputs_map = self.backward_grad_inputs_map
backward_grad_outputs_map = self.backward_grad_outputs_map
backward_attrs_list = self.backward_attrs_list
# Construct grad_api function args
# Order: TensorWrappers, GradTensors, Attributes
grad_api_args_len = len(backward_fwd_input_map.keys()) + len(
backward_grad_input_map.keys()) + len(backward_attrs_list)
grad_api_args_len = len(backward_forward_inputs_map.keys()) + len(
backward_grad_inputs_map.keys()) + len(backward_attrs_list)
grad_api_args = ["" for i in range(grad_api_args_len)]
for name, (_, is_fwd_input,
grad_api_position), in backward_fwd_input_map.items():
grad_api_position), in backward_forward_inputs_map.items():
tensor_wrapper_name = GetSavedName(name)
grad_api_args[
grad_api_position] = f"egr::EagerUtils::RecoverTensorWrapper(&this->{tensor_wrapper_name}, nullptr)"
for _, (ttype, fwd_position,
grad_api_position) in backward_grad_input_map.items():
grad_api_position) in backward_grad_inputs_map.items():
if IsPlainTensorType(ttype):
grad_api_args[
grad_api_position] = f"hooked_grads[{fwd_position}][0]"
else:
assert IsVectorTensorType(ttype)
grad_api_args[grad_api_position] = f"hooked_grads[{fwd_position}]"
grad_api_args[
grad_api_position] = f"hooked_grads[{fwd_position}]"
for name, _, _, grad_api_position in backward_attrs_list:
saved_attribute_name = GetSavedName(name)
......@@ -672,10 +680,10 @@ def GenerateNodeDefinition(fwd_api_name, bwd_api_name, backward_fwd_input_map,
grad_api_args_str = ", ".join(grad_api_args)
# Construct grad_api returns
num_bwd_outputs = len(backward_grad_output_map.keys())
num_bwd_outputs = len(backward_grad_outputs_map.keys())
returns_str = f"std::vector<std::vector<paddle::experimental::Tensor>> returns({num_bwd_outputs});\n"
for _, (ttype, fwd_position,
grad_api_position) in backward_grad_output_map.items():
grad_api_position) in backward_grad_outputs_map.items():
# Infer Grad API Return Type
if num_bwd_outputs == 1:
# Single tensor output, return as is
......@@ -690,50 +698,135 @@ def GenerateNodeDefinition(fwd_api_name, bwd_api_name, backward_fwd_input_map,
returns_str += f"if(NeedComplexToRealConversion()) HandleComplexGradToRealGrad(&returns);\n"
returns_str += f"return returns;\n"
grad_node_name = GetGradNodeName(fwd_api_name)
grad_node_name = GetGradNodeName(forward_api_name)
fill_zero_str = ""
if fwd_api_name in ops_to_fill_zero_for_empty_grads:
if forward_api_name in ops_to_fill_zero_for_empty_grads:
fill_zero_str = "egr::EagerUtils::FillZeroForEmptyGradInputs(&grads, this->InputMeta());\n"
if len(namespace) > 0:
grad_api_namespace = f"paddle::experimental::{namespace}"
self.node_definition_str = FUNCTION_TEMPLATE.format(
grad_node_name, fill_zero_str, grad_node_name, grad_api_namespace,
backward_api_name, grad_api_args_str, returns_str)
print("Generated Node Definition: ", self.node_definition_str)
def GenerateForwardDefinition(self, is_inplaced):
namespace = self.namespace
forward_api_name = GetInplacedFunctionName(
self.forward_api_name) if is_inplaced else self.forward_api_name
backward_api_name = self.backward_api_name
forward_inputs_position_map = self.forward_inputs_position_map
forward_outputs_position_map = self.forward_outputs_position_map
forward_attrs_list = self.forward_attrs_list
backward_forward_inputs_map = self.backward_forward_inputs_map
backward_grad_inputs_map = self.backward_grad_inputs_map
backward_grad_outputs_map = self.backward_grad_outputs_map
backward_attrs_list = self.backward_attrs_list
optional_inputs = self.optional_inputs
intermediate_outputs = self.intermediate_outputs
inplace_map = self.inplace_map
# Get Function Args
num_inputs = len(forward_attrs_list) + len(
forward_inputs_position_map.keys())
inputs_args_definition_list = ["" for i in range(num_inputs)]
inputs_args_declaration_list = ["" for i in range(num_inputs)]
inputs_call_list = ["" for i in range(num_inputs)]
for name, (ttype, pos) in forward_inputs_position_map.items():
inputs_call_list[pos] = f"{name}"
is_optional = (name in optional_inputs)
if IsPlainTensorType(ttype):
if is_optional:
arg_str = f"const paddle::optional<paddle::experimental::Tensor>& {name}"
else:
if inplace_map and name in inplace_map.keys():
arg_str = f"paddle::experimental::Tensor& {name}"
else:
arg_str = f"const paddle::experimental::Tensor& {name}"
else:
assert IsVectorTensorType(ttype)
arg_str = f"const std::vector<paddle::experimental::Tensor>& {name}"
inputs_args_definition_list[pos] = arg_str
inputs_args_declaration_list[pos] = arg_str
for name, atype, default_val, pos in forward_attrs_list:
inputs_call_list[pos] = name
if default_val is not None:
inputs_args_declaration_list[
pos] = f"{atype} {name} = {default_val}"
else:
inputs_args_declaration_list[pos] = f"{atype} {name}"
inputs_args_definition_list[pos] = f"{atype} {name}"
inputs_args_declaration_str = ", ".join(inputs_args_declaration_list)
inputs_args_definition_str = ", ".join(inputs_args_definition_list)
inputs_call_args_str = ", ".join(inputs_call_list)
# Forward Full Logic
function_name = forward_api_name
if len(intermediate_outputs) > 0:
function_name = GetIntermediateAPIFunctionName(function_name)
forward_call_str = f"auto api_result = paddle::experimental::{namespace}{function_name}({inputs_call_args_str});"
# Get return type list & outputs
num_outputs = len(forward_outputs_position_map.keys()) - len(
intermediate_outputs)
returns_type_list = ["" for i in range(num_outputs)]
returns_list = ["" for i in range(num_outputs)]
for name, (rtype, pos) in forward_outputs_position_map.items():
if name in intermediate_outputs:
continue
if num_outputs == 1:
returns_list[0] = f"api_result"
else:
# Tuple api_result
returns_list[pos] = f"std::get<{pos}>(api_result)"
if IsPlainTensorType(rtype):
returns_type_list[pos] = "paddle::experimental::Tensor"
else:
assert IsVectorTensorType(rtype)
returns_type_list[
pos] = "std::vector<paddle::experimental::Tensor>"
if num_outputs == 1:
returns_str = returns_list[0]
returns_type_str = returns_type_list[0]
else:
grad_api_namespace = f"paddle::experimental"
returns_type_str = ", ".join(returns_type_list)
returns_type_str = f"std::tuple<{returns_type_str}>"
returns_str = ", ".join(returns_list)
returns_str = f"std::make_tuple({returns_str})"
FUNCTION_TEMPLATE = """
std::vector<std::vector<paddle::experimental::Tensor>> {}::operator()(std::vector<std::vector<paddle::experimental::Tensor>>& grads, bool create_graph) {{
{}
auto hooked_grads = ApplyGradientHooks(grads);
self.GenerateNodeCreationCodes(forward_call_str)
// Call grad_api function
VLOG(3) << \"Final State Running: \" << \"{}\";
auto grad_api_returns = {}::{}({});
{}
}}
"""
node_creation_str = self.node_creation_str
dygraph_event_str = f"paddle::platform::RecordEvent dygraph_entrance_record_event(\"{forward_api_name} dygraph\", paddle::platform::TracerEventType::Operator, 1);"
forward_function_name = GetDygraphForwardFunctionName(forward_api_name)
node_definition_str = FUNCTION_TEMPLATE.format(
grad_node_name, fill_zero_str, grad_node_name, grad_api_namespace,
bwd_api_name, grad_api_args_str, returns_str)
return node_definition_str
def GenerateNodeCreationCodes(
fwd_api_name, bwd_api_name, forward_inputs_position_map,
forward_outputs_position_map, forward_attrs_list, forward_call_str,
backward_fwd_input_map, backward_grad_input_map,
backward_grad_output_map, backward_attrs_list, optional_inputs,
inplace_map):
# fwd_api_name = ""
# forward_inputs_position_map = { "name" : [type, fwd_position] }
# forward_outputs_position_map = { "name" : [type, fwd_position] }
# forward_attrs_list = [ [attr_name, attr_type, default_value, orig_position], ...]
# backward_fwd_input_map = { "name" : [type, is_fwd_input, orig_position] ...}
# backward_grad_input_map = { "name" : [type, fwd_position, orig_position] ...}
# backward_grad_output_map = { "name" : [type, fwd_position, orig_position] ...}
# backward_attrs_list = [ [attr_name, attr_type, default_value, orig_position], ...]
self.forward_definition_str += FORWARD_FUNCTION_TEMPLATE.format(
returns_type_str, forward_function_name, inputs_args_definition_str,
dygraph_event_str, node_creation_str, returns_str)
self.forward_declaration_str += f"{returns_type_str} {forward_function_name}({inputs_args_declaration_str});\n"
print("Generated Forward Definition: ", self.forward_definition_str)
print("Generated Forward Declaration: ", self.forward_declaration_str)
def GenerateNodeCreationCodes(self, forward_call_str):
forward_api_name = self.forward_api_name
forward_inputs_position_map = self.forward_inputs_position_map
forward_outputs_position_map = self.forward_outputs_position_map
forward_attrs_list = self.forward_attrs_list
backward_forward_inputs_map = self.backward_forward_inputs_map
backward_grad_inputs_map = self.backward_grad_inputs_map
backward_grad_outputs_map = self.backward_grad_outputs_map
backward_attrs_list = self.backward_attrs_list
optional_inputs = self.optional_inputs
inplace_map = self.inplace_map
# Get Input AutoGradMeta
inputs_autograd_meta_list = []
......@@ -788,24 +881,17 @@ def GenerateNodeCreationCodes(
bump_inplace_version_str = ""
for inplace_name in inplace_map.keys():
inplace_autograd_meta_name = GetAutoGradMetaName(inplace_name)
check_inplace_str += f"""
// Check Inplace
egr::EagerUtils::CheckInplace({inplace_name}, {inplace_autograd_meta_name}, require_any_grad);\n
"""
bump_inplace_version_str += f"""
// Bump Inplace Version
{inplace_name}.bump_inplace_version();
VLOG(3) << \"Tensor(\" << {inplace_name}.name() << \") uses Inplace Strategy.\";\n
"""
check_inplace_str += CHECK_INPLACE_TEMPLATE.format(
inplace_name, inplace_autograd_meta_name)
bump_inplace_version_str += BUMP_INPLACE_VERSION_TEMPLATE.format(
inplace_name, inplace_name)
# Node Construction
num_bwd_inputs = len(backward_grad_input_map.keys())
num_bwd_outputs = len(backward_grad_output_map.keys())
grad_node_name = GetGradNodeName(
RecoverBaseNameOfInplaceFunction(
fwd_api_name)) if inplace_map else GetGradNodeName(fwd_api_name)
node_construction_str = f" auto grad_node = std::make_shared<{grad_node_name}>({num_bwd_inputs}, {num_bwd_outputs});"
num_backward_inputs = len(backward_grad_inputs_map.keys())
num_backward_outputs = len(backward_grad_outputs_map.keys())
grad_node_name = GetGradNodeName(forward_api_name)
node_construction_str = f" auto grad_node = std::make_shared<{grad_node_name}>({num_backward_inputs}, {num_backward_outputs});"
# SetAttributes
set_attributes_list = []
......@@ -823,7 +909,8 @@ def GenerateNodeCreationCodes(
# SetTensorWrappers
set_tensor_wrappers_list = []
for name, (atype, is_fwd_input, pos) in backward_fwd_input_map.items():
for name, (atype, is_fwd_input,
pos) in backward_forward_inputs_map.items():
is_optional = (name in optional_inputs)
if is_fwd_input:
......@@ -869,13 +956,13 @@ def GenerateNodeCreationCodes(
output_autograd_meta_name = GetAutoGradMetaName(name)
set_out_rank = f" egr::EagerUtils::SetOutRankWithSlot({output_autograd_meta_name}, {pos});"
set_history = f" egr::EagerUtils::SetHistory({output_autograd_meta_name}, grad_node);"
if num_outputs == 1:
set_retain_grad = f" egr::EagerUtils::CheckAndRetainGrad(api_result);"
set_grad_in_meta = f" grad_node->SetGradInMeta(api_result, {pos});"
else:
set_retain_grad = f" egr::EagerUtils::CheckAndRetainGrad(std::get<{pos}>(api_result));"
set_grad_in_meta = f" grad_node->SetGradInMeta(std::get<{pos}>(api_result), {pos});"
set_out_rank_list.append(set_out_rank)
set_history_list.append(set_history)
set_grad_in_meta_list.append(set_grad_in_meta)
......@@ -886,324 +973,230 @@ def GenerateNodeCreationCodes(
set_grad_in_meta_str = "\n".join(set_grad_in_meta_list)
set_retain_grad_str = "\n".join(set_retain_grad_list)
node_event_name = fwd_api_name + " node_creation"
NODE_CREATION_TEMPLATE = """
paddle::platform::RecordEvent node_creation_record_event(\"{}\", paddle::platform::TracerEventType::Operator, 1);\n
"""
node_creation_event_str = NODE_CREATION_TEMPLATE.format(node_event_name)
NODE_CREATION_TEMPLATE = """
// Get AutoGradMeta
{}
bool trace_backward = egr::Controller::Instance().HasGrad();
bool require_any_grad = egr::EagerUtils::ComputeRequireGrad({});
{}
// Forward API Call
{}
{}
{{
{}
{}
if(require_any_grad) {{
egr::EagerUtils::PassStopGradient({});
// Node Construction
{}
// SetAttributes
{}
// SetTensorWrappers
{}
// SetGradOutMeta & SetEdges
{}
{}
// SetOutRank & SetHistory & SetGradInMeta & RetainGrad
{}
{}
{}
{}
}}
}}
node_event_name = forward_api_name + " node_creation"
node_creation_event_str = f"paddle::platform::RecordEvent node_creation_record_event(\"{node_event_name}\", paddle::platform::TracerEventType::Operator, 1);\n"
"""
node_creation_str = NODE_CREATION_TEMPLATE.format(
self.node_creation_str = NODE_CREATION_TEMPLATE.format(
inputs_autograd_meta_str, compute_require_grad_args_str,
check_inplace_str, forward_call_str, bump_inplace_version_str,
node_creation_event_str, outputs_autograd_meta_str,
pass_stop_gradient_args_str, node_construction_str, set_attributes_str,
set_tensor_wrappers_str, set_grad_out_meta_str, set_edges_str,
set_out_rank_str, set_history_str, set_grad_in_meta_str,
set_retain_grad_str)
return node_creation_str
def GenerateForwardDefinition(
fwd_api_name, bwd_api_name, forward_inputs_position_map,
forward_outputs_position_map, forward_attrs_list,
backward_fwd_input_map, backward_grad_input_map,
backward_grad_output_map, backward_attrs_list, optional_inputs,
intermediate_outputs, inplace_map):
# fwd_api_name = ""
# forward_inputs_position_map = { "name" : [type, fwd_position] }
# forward_outputs_position_map = { "name" : [type, fwd_position] }
# forward_attrs_list = [ [attr_name, attr_type, default_value, orig_position], ...]
# backward_fwd_input_map = { "name" : [type, is_fwd_input, orig_position] ...}
# backward_grad_input_map = { "name" : [type, fwd_position, orig_position] ...}
# backward_grad_output_map = { "name" : [type, fwd_position, orig_position] ...}
# backward_attrs_list = [ [attr_name, attr_type, default_value, orig_position], ...]
# optional_inputs = ["name0", ...]
pass_stop_gradient_args_str, node_construction_str,
set_attributes_str, set_tensor_wrappers_str, set_grad_out_meta_str,
set_edges_str, set_out_rank_str, set_history_str,
set_grad_in_meta_str, set_retain_grad_str)
# Get Function Args
num_inputs = len(forward_attrs_list) + len(forward_inputs_position_map.keys(
))
inputs_args_definition_list = ["" for i in range(num_inputs)]
inputs_args_declaration_list = ["" for i in range(num_inputs)]
inputs_call_list = ["" for i in range(num_inputs)]
def GenerateInplacedForwardDygraphFunctions(self):
# Inplaced Version Dygraph Function Generation
forward_api_name = self.forward_api_name
forward_api_contents = self.forward_api_contents
if forward_api_name != "sum" and "inplace" in forward_api_contents.keys(
):
# Node Definition Generation
self.GenerateForwardDefinition(is_inplaced=True)
self.UpdateCoreOpsInformation(is_inplaced=True)
def UpdateCoreOpsInformation(self, is_inplaced):
forward_api_name = GetInplacedFunctionName(
self.forward_api_name) if is_inplaced else self.forward_api_name
forward_inputs_position_map = self.forward_inputs_position_map
forward_outputs_position_map = self.forward_outputs_position_map
forward_attrs_list = self.forward_attrs_list
num_args = len(forward_inputs_position_map.keys()) + len(
forward_attrs_list)
num_returns = len(forward_outputs_position_map.keys())
final_state_fwd_api_name = "final_state_" + forward_api_name
core_ops_returns_info[
final_state_fwd_api_name] = ["" for i in range(num_returns)]
core_ops_args_info[
final_state_fwd_api_name] = ["" for i in range(num_args)]
core_ops_args_type_info[
final_state_fwd_api_name] = ["" for i in range(num_args)]
for name, (ttype, pos) in forward_inputs_position_map.items():
inputs_call_list[pos] = f"{name}"
is_optional = (name in optional_inputs)
core_ops_args_info[final_state_fwd_api_name][pos] = name
if IsPlainTensorType(ttype):
if is_optional:
arg_str = f"const paddle::optional<paddle::experimental::Tensor>& {name}"
else:
if inplace_map and name in inplace_map.keys():
arg_str = f"paddle::experimental::Tensor& {name}"
else:
arg_str = f"const paddle::experimental::Tensor& {name}"
core_ops_args_type_info[final_state_fwd_api_name][
pos] = "tensor"
else:
assert IsVectorTensorType(ttype)
arg_str = f"const std::vector<paddle::experimental::Tensor>& {name}"
core_ops_args_type_info[final_state_fwd_api_name][pos] = "list"
inputs_args_definition_list[pos] = arg_str
inputs_args_declaration_list[pos] = arg_str
for name, _, _, pos in forward_attrs_list:
core_ops_args_info[final_state_fwd_api_name][pos] = name
for name, atype, default_val, pos in forward_attrs_list:
inputs_call_list[pos] = name
if default_val is not None:
inputs_args_declaration_list[
pos] = f"{atype} {name} = {default_val}"
else:
inputs_args_declaration_list[pos] = f"{atype} {name}"
inputs_args_definition_list[pos] = f"{atype} {name}"
for name, (ttype, pos) in forward_outputs_position_map.items():
core_ops_returns_info[final_state_fwd_api_name][pos] = name
inputs_args_declaration_str = ", ".join(inputs_args_declaration_list)
inputs_args_definition_str = ", ".join(inputs_args_definition_list)
inputs_call_args_str = ", ".join(inputs_call_list)
def run(self):
# Basic Validation Check
self.DygraphYamlValidationCheck()
# Forward Full Logic
if len(intermediate_outputs) == 0:
function_name = fwd_api_name
else:
function_name = fwd_api_name + "_intermediate"
##########################
## Parsing Raw Contents ##
##########################
# Parse inplace_map
self.ParseInplaceInfo()
if len(namespace) > 0:
forward_call_str = f"auto api_result = paddle::experimental::{namespace}::{function_name}({inputs_call_args_str});"
else:
forward_call_str = f"auto api_result = paddle::experimental::{function_name}({inputs_call_args_str});"
# Parse no_need_buffer
self.ParseNoNeedBuffer()
# Get return type list & outputs
num_outputs = len(forward_outputs_position_map.keys()) - len(
intermediate_outputs)
returns_type_list = ["" for i in range(num_outputs)]
returns_list = ["" for i in range(num_outputs)]
for name, (rtype, pos) in forward_outputs_position_map.items():
if name in intermediate_outputs:
continue
if num_outputs == 1:
returns_list[0] = f"api_result"
else:
# Tuple api_result
returns_list[pos] = f"std::get<{pos}>(api_result)"
# Parse optional_inputs
self.ParseDispensable()
if IsPlainTensorType(rtype):
returns_type_list[pos] = "paddle::experimental::Tensor"
else:
assert IsVectorTensorType(rtype)
returns_type_list[pos] = "std::vector<paddle::experimental::Tensor>"
# Parse intermediate_outputs
self.ParseIntermediate()
self.IntermediateValidationCheck()
if num_outputs == 1:
returns_str = returns_list[0]
returns_type_str = returns_type_list[0]
else:
returns_type_str = ", ".join(returns_type_list)
returns_type_str = f"std::tuple<{returns_type_str}>"
returns_str = ", ".join(returns_list)
returns_str = f"std::make_tuple({returns_str})"
# Initialize backward_forward_str, backward_inputs_list, backward_attrs_list, backward_returns_list
self.CollectBackwardInfo()
node_creation_str = GenerateNodeCreationCodes(
fwd_api_name, bwd_api_name, forward_inputs_position_map,
forward_outputs_position_map, forward_attrs_list, forward_call_str,
backward_fwd_input_map, backward_grad_input_map,
backward_grad_output_map, backward_attrs_list, optional_inputs,
inplace_map)
# Initialize forward_inputs_list, forward_attrs_list, forward_returns_list
self.CollectForwardInfoFromBackwardContents()
dygraph_event_str = f"paddle::platform::RecordEvent dygraph_entrance_record_event(\"{fwd_api_name} dygraph\", paddle::platform::TracerEventType::Operator, 1);"
# Initialize orig_forward_inputs_list, orig_forward_attrs_list, orig_forward_returns_list
self.CollectOriginalForwardInfo()
FORWARD_FUNCTION_TEMPLATE = """
{} {}({}) {{
{}
# Forwards Validation Check
self.ForwardsValidationCheck()
{}
#############################
## Process Parsed Contents ##
#############################
# Initialize forward_inputs_position_map, forward_outputs_position_map
self.DetermineForwardPositionMap(self.forward_inputs_list,
self.forward_returns_list)
// Returns
return {};
}}
"""
# Initialize forward_inputs_position_map, forward_outputs_position_map
self.SlotNameMatching()
forward_function_name = GetForwardFunctionName(fwd_api_name)
forward_function_str = FORWARD_FUNCTION_TEMPLATE.format(
returns_type_str, forward_function_name, inputs_args_definition_str,
dygraph_event_str, node_creation_str, returns_str)
forward_function_declaration_str = f"{returns_type_str} {forward_function_name}({inputs_args_declaration_str});"
# Backward Validation Check
self.BackwardValidationCheck()
return forward_function_str, forward_function_declaration_str
#####################
## Code Generation ##
#####################
self.GenerateNodeDeclaration()
self.GenerateNodeDefinition()
self.GenerateForwardDefinition(is_inplaced=False)
self.UpdateCoreOpsInformation(is_inplaced=False)
def CollectCoreOpsInformation(fwd_api_name, forward_inputs_position_map,
forward_outputs_position_map, forward_attrs_list):
# fwd_api_name : ""
# forward_inputs_position_map = { "name" : [type, fwd_position] }
# forward_outputs_position_map = { "name" : [type, fwd_position] }
# forward_attrs_list = [ [attr_name, attr_type, default_value, orig_position], ...]
num_args = len(forward_inputs_position_map.keys()) + len(forward_attrs_list)
num_returns = len(forward_outputs_position_map.keys())
self.GenerateInplacedForwardDygraphFunctions()
final_state_fwd_api_name = "final_state_" + fwd_api_name
core_ops_returns_info[
final_state_fwd_api_name] = ["" for i in range(num_returns)]
core_ops_args_info[final_state_fwd_api_name] = ["" for i in range(num_args)]
core_ops_args_type_info[
final_state_fwd_api_name] = ["" for i in range(num_args)]
for name, (ttype, pos) in forward_inputs_position_map.items():
core_ops_args_info[final_state_fwd_api_name][pos] = name
if IsPlainTensorType(ttype):
core_ops_args_type_info[final_state_fwd_api_name][pos] = "tensor"
else:
assert IsVectorTensorType(ttype)
core_ops_args_type_info[final_state_fwd_api_name][pos] = "list"
for name, _, _, pos in forward_attrs_list:
core_ops_args_info[final_state_fwd_api_name][pos] = name
class DygraphYamlGenerator(YamlGeneratorBase):
def __init__(self, api_yaml_path, backward_yaml_path):
# Parent members:
# self.namespace
# self.api_yaml_path
# self.forward_api_list
YamlGeneratorBase.__init__(self, api_yaml_path)
for name, (ttype, pos) in forward_outputs_position_map.items():
core_ops_returns_info[final_state_fwd_api_name][pos] = name
self.backward_yaml_path = backward_yaml_path
self.grad_api_dict = {}
self.forward_definition_str = ""
self.forward_declaration_str = ""
self.node_declaration_str = ""
self.node_definition_str = ""
def GenerateCoreOpInfoDeclaration():
core_ops_declaration_str = """
extern std::unordered_map<std::string, std::vector<std::string>> core_ops_final_state_args_info;
extern std::unordered_map<std::string, std::vector<std::string>> core_ops_final_state_args_type_info;
extern std::unordered_map<std::string, std::vector<std::string>> core_ops_final_state_returns_info;
def ParseYamlContents(self):
self.ParseForwardYamlContents()
"""
return core_ops_declaration_str
backward_yaml_path = self.backward_yaml_path
self.grad_api_dict = ReadBwdFile(backward_yaml_path)
def GetBackwardAPIContents(self, forward_api_contents):
grad_api_dict = self.grad_api_dict
def GenerateCoreOpInfoDefinition():
if 'backward' not in forward_api_contents.keys(): return None
CORE_OPS_INFO_TEMPLATE = """
std::unordered_map<std::string, std::vector<std::string>> core_ops_final_state_args_info = {{
{}
}};
std::unordered_map<std::string, std::vector<std::string>> core_ops_final_state_args_type_info = {{
{}
}};
std::unordered_map<std::string, std::vector<std::string>> core_ops_final_state_returns_info = {{
{}
}};
backward_api_name = forward_api_contents['backward']
assert backward_api_name in grad_api_dict.keys()
backward_api_contents = grad_api_dict[backward_api_name]
"""
op_args_info_list = []
for op_name, arg_list in core_ops_args_info.items():
arg_str = ",".join(["\"" + v + "\"" for v in arg_list])
op_args_info = f"{{ \"{op_name}\", {{ {arg_str} }} }},"
op_args_info_list.append(op_args_info)
return backward_api_contents
op_types_info_list = []
for op_name, type_list in core_ops_args_type_info.items():
type_str = ",".join(["\"" + v + "\"" for v in type_list])
op_types_info = f"{{ \"{op_name}\", {{ {type_str} }} }},"
op_types_info_list.append(op_types_info)
def GenerateCode(self):
forward_api_list = self.forward_api_list
grad_api_dict = self.grad_api_dict
namespace = self.namespace
op_returns_info_list = []
for op_name, return_list in core_ops_returns_info.items():
return_str = ",".join(["\"" + v + "\"" for v in return_list])
return_types_info = f"{{ \"{op_name}\", {{ {return_str} }} }},"
op_returns_info_list.append(return_types_info)
for forward_api_contents in forward_api_list:
backward_api_contents = self.GetBackwardAPIContents(
forward_api_contents)
if backward_api_contents is None: continue
op_args_info_str = "\n".join(op_args_info_list)
op_types_info_str = "\n".join(op_types_info_list)
op_returns_info_str = "\n".join(op_returns_info_list)
d_generator = DygraphSingleFunctionGenerator(
forward_api_contents, backward_api_contents, namespace)
d_generator.run()
core_ops_info_definition_str = CORE_OPS_INFO_TEMPLATE.format(
op_args_info_str, op_types_info_str, op_returns_info_str)
self.forward_definition_str += d_generator.forward_definition_str + "\n"
self.forward_declaration_str += d_generator.forward_declaration_str + "\n"
self.node_declaration_str += d_generator.node_declaration_str + "\n"
self.node_definition_str += d_generator.node_definition_str + "\n"
return core_ops_info_definition_str
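# Hedged illustration of a single generated map entry (hypothetical op name and
# argument list; the real data comes from core_ops_args_info collected above):
_demo_op_name = "final_state_matmul"
_demo_arg_list = ["x", "y", "transpose_x", "transpose_y"]
_demo_arg_str = ",".join(["\"" + v + "\"" for v in _demo_arg_list])
_demo_entry = f"{{ \"{_demo_op_name}\", {{ {_demo_arg_str} }} }},"
# _demo_entry == '{ "final_state_matmul", { "x","y","transpose_x","transpose_y" } },'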
if len(namespace) > 0:
if namespace.endswith("::"):
namespace = namespace[:-2]
self.forward_definition_str = NAMESPACE_WRAPPER_TEMPLATE.format(
namespace, self.forward_definition_str)
self.forward_declaration_str = NAMESPACE_WRAPPER_TEMPLATE.format(
namespace, self.forward_declaration_str)
self.node_declaration_str = NAMESPACE_WRAPPER_TEMPLATE.format(
namespace, self.node_declaration_str)
self.node_definition_str = NAMESPACE_WRAPPER_TEMPLATE.format(
namespace, self.node_definition_str)
def run(self):
self.ParseYamlContents()
self.InferNameSpace()
self.GenerateCode()
##################
## File Writers ##
##################
def GenerateNodeCCFile(filepath, node_definition_str):
file_contents = """
#include "glog/logging.h"
#include "paddle/phi/api/all.h"
#include "paddle/phi/api/backward/backward_api.h"
#include "paddle/fluid/imperative/tracer.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/eager/utils.h"
#include "paddle/fluid/eager/api/utils/global_utils.h"
#include "paddle/fluid/eager/api/generated/eager_generated/backwards/nodes.h"
#include "paddle/fluid/eager/to_static/run_program_op_node.h"
if os.path.exists(filepath):
os.remove(filepath)
#include "paddle/phi/api/backward/sparse_bw_api.h"
"""
file_contents += node_definition_str
file_contents = NODE_CC_FILE_TEMPLATE.format(node_definition_str)
with open(filepath, 'a') as f:
f.write(file_contents)
def GenerateNodeHFile(filepath, node_declaration_str):
file_contents = """
#pragma once
#include "paddle/fluid/eager/tensor_wrapper.h"
#include "paddle/fluid/eager/grad_node_info.h"
if os.path.exists(filepath):
os.remove(filepath)
"""
file_contents += node_declaration_str
file_contents = NODE_H_FILE_TEMPLATE.format(node_declaration_str)
with open(filepath, 'a') as f:
f.write(file_contents)
def GenerateForwardCCFile(filepath, forward_definition_str):
file_contents = """
#include "paddle/phi/api/lib/dygraph_api.h"
#include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h"
#include "paddle/fluid/eager/api/generated/eager_generated/backwards/nodes.h"
#include "paddle/phi/api/include/sparse_api.h"
#include "paddle/fluid/eager/api/utils/global_utils.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
"""
if os.path.exists(filepath):
os.remove(filepath)
file_contents += GenerateCoreOpInfoDefinition()
file_contents += forward_definition_str
core_ops_info_str = GenerateCoreOpInfoDefinition()
file_contents = FORWARD_CC_FILE_TEMPLATE.format(core_ops_info_str,
forward_definition_str)
with open(filepath, 'a') as f:
f.write(file_contents)
def GenerateForwardHFile(filepath, forward_function_declaration_str):
file_contents = """
#pragma once
#include "glog/logging.h"
#include "paddle/fluid/eager/autograd_meta.h"
#include "paddle/phi/api/all.h"
#include "paddle/fluid/eager/utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/eager/to_static/run_program_op_func.h"
if os.path.exists(filepath):
os.remove(filepath)
"""
file_contents += GenerateCoreOpInfoDeclaration()
file_contents += forward_function_declaration_str
core_ops_info_str = GenerateCoreOpInfoDeclaration()
file_contents = FORWARD_H_FILE_TEMPLATE.format(
core_ops_info_str, forward_function_declaration_str)
with open(filepath, 'a') as f:
f.write(file_contents)
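# Hedged usage sketch of the four file writers above (the output paths here are
# hypothetical; the real ones come from the argparse flags handled in __main__):
def _demo_write_generated_files(node_decl, node_def, fwd_decl, fwd_def):
    GenerateNodeHFile("nodes.h", node_decl)
    GenerateNodeCCFile("nodes.cc", node_def)
    GenerateForwardHFile("dygraph_functions.h", fwd_decl)
    GenerateForwardCCFile("dygraph_functions.cc", fwd_def)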
......@@ -1224,199 +1217,13 @@ if __name__ == "__main__":
api_yaml_path = api_yaml_paths[i]
backward_yaml_path = backward_yaml_paths[i]
if "sparse" in api_yaml_path:
assert "sparse" in backward_yaml_path
namespace = "sparse"
else:
namespace = ""
fwd_api_list = ReadFwdFile(api_yaml_path)
grad_api_dict = ReadBwdFile(backward_yaml_path)
yaml_forward_definition_str = ""
yaml_forward_declaration_str = ""
yaml_node_declaration_str = ""
yaml_node_definition_str = ""
for fwd_api in fwd_api_list:
# We only generate Ops with grad
if 'backward' not in fwd_api.keys():
continue
assert 'api' in fwd_api.keys()
assert 'args' in fwd_api.keys()
assert 'output' in fwd_api.keys()
assert 'backward' in fwd_api.keys()
no_need_buffer_set = set()
if 'no_need_buffer' in fwd_api.keys():
no_need_buffer_set = ParseNoNeedBuffer(fwd_api[
'no_need_buffer'])
fwd_api_name = fwd_api['api']
fwd_args_str = fwd_api['args']
fwd_returns_str = fwd_api['output']
inplace_map = {}
if 'inplace' in fwd_api.keys():
inplace_map = ParseInplaceInfo(fwd_api['inplace'])
bwd_api_name = fwd_api['backward']
assert bwd_api_name in grad_api_dict.keys(), bwd_api_name
bwd_api = grad_api_dict[bwd_api_name]
assert 'args' in bwd_api.keys()
assert 'output' in bwd_api.keys()
assert 'forward' in bwd_api.keys()
# Parse Dispensable Inputs
optional_inputs = []
if 'optional' in fwd_api.keys():
optional_inputs = ParseDispensable(fwd_api['optional'])
bwd_forward_str = bwd_api['forward']
bwd_args_str = bwd_api['args']
bwd_returns_str = bwd_api['output']
# Collect Forward Inputs/Outputs
forward_inputs_list, forward_attrs_list, forward_returns_list = ParseYamlForwardFromBackward(
bwd_forward_str)
print("Parsed Forward Inputs List: ", forward_inputs_list)
print("Prased Forward Attrs List: ", forward_attrs_list)
print("Parsed Forward Returns List: ", forward_returns_list)
intermediate_outputs = []
if 'intermediate' in fwd_api.keys():
intermediate_outputs = ParseIntermediate(fwd_api[
'intermediate'])
IntermediateValidationCheck(intermediate_outputs,
forward_returns_list)
# Collect Original Forward Inputs/Outputs and then perform validation checks
orig_forward_inputs_list, orig_forward_attrs_list, orig_forward_returns_list = ParseYamlForward(
fwd_args_str, fwd_returns_str)
print("Parsed Original Forward Inputs List: ",
orig_forward_inputs_list)
print("Prased Original Forward Attrs List: ",
orig_forward_attrs_list)
print("Parsed Original Forward Returns List: ",
orig_forward_returns_list)
# Forward Validation Checks
ForwardsValidationCheck(
forward_inputs_list, forward_attrs_list, forward_returns_list,
orig_forward_inputs_list, orig_forward_attrs_list,
orig_forward_returns_list)
# Parse Backward Inputs/Outputs
backward_inputs_list, backward_attrs_list, backward_returns_list = ParseYamlBackward(
bwd_args_str, bwd_returns_str)
print("Parsed Backward Inputs List: ", backward_inputs_list)
print("Prased Backward Attrs List: ", backward_attrs_list)
print("Parsed Backward Returns List: ", backward_returns_list)
# Determine Forward Inputs/Outputs Position
forward_inputs_position_map, forward_outputs_position_map = DetermineForwardPositionMap(
forward_inputs_list, forward_returns_list)
print("Generated Forward Input Position Map: ",
forward_inputs_position_map)
print("Generated Forward Output Position Map: ",
forward_outputs_position_map)
# SlotName Matching
backward_fwd_input_map, backward_grad_input_map, backward_grad_output_map = SlotNameMatching(
backward_inputs_list, backward_returns_list,
forward_inputs_position_map, forward_outputs_position_map)
print("Generated Backward Fwd Input Map: ", backward_fwd_input_map)
print("Generated Backward Grad Input Map: ",
backward_grad_input_map)
print("Generated Backward Grad Output Map: ",
backward_grad_output_map)
# Backward Validation Check
BackwardValidationCheck(backward_fwd_input_map,
backward_grad_input_map,
backward_attrs_list)
# Node Declaration Generation
yaml_node_declaration_str += GenerateNodeDeclaration(
fwd_api_name, backward_fwd_input_map, backward_attrs_list,
no_need_buffer_set)
print("Generated Node Declaration: ", node_declaration_str)
yaml_node_definition_str += GenerateNodeDefinition(
fwd_api_name, bwd_api_name, backward_fwd_input_map,
backward_grad_input_map, backward_grad_output_map,
backward_attrs_list)
print("Generated Node Definition: ", node_definition_str)
# Node Definition Generation
definition_declaration_pair = GenerateForwardDefinition(
fwd_api_name, bwd_api_name, forward_inputs_position_map,
forward_outputs_position_map, orig_forward_attrs_list,
backward_fwd_input_map, backward_grad_input_map,
backward_grad_output_map, backward_attrs_list, optional_inputs,
intermediate_outputs, {})
print("Generated Forward Definition: ", forward_definition_str)
print("Generated Forward Declaration: ", forward_declaration_str)
yaml_forward_definition_str += definition_declaration_pair[0]
yaml_forward_declaration_str += definition_declaration_pair[1]
# For python-level API dispatch
CollectCoreOpsInformation(fwd_api_name, forward_inputs_position_map,
forward_outputs_position_map,
orig_forward_attrs_list)
# Inplaced Version Dygraph Function Generation
if fwd_api_name != "sum" and "inplace" in fwd_api.keys():
fwd_api_name_inplaced = GetInplacedFunctionName(fwd_api_name)
# Node Definition Generation
definition_declaration_pair = GenerateForwardDefinition(
fwd_api_name_inplaced, bwd_api_name,
forward_inputs_position_map, forward_outputs_position_map,
forward_attrs_list, backward_fwd_input_map,
backward_grad_input_map, backward_grad_output_map,
backward_attrs_list, optional_inputs, intermediate_outputs,
inplace_map)
print("Generated Inplaced Forward Definition: ",
forward_definition_str)
print("Generated Inplaced Forward Declaration: ",
forward_declaration_str)
forward_definition_str += definition_declaration_pair[0]
forward_declaration_str += definition_declaration_pair[1]
# For python-level API dispatch
CollectCoreOpsInformation(
fwd_api_name_inplaced, forward_inputs_position_map,
forward_outputs_position_map, forward_attrs_list)
if len(namespace) > 0:
forward_definition_str += f"""namespace {namespace} {{
{yaml_forward_definition_str}
}}
"""
forward_declaration_str += f"""namespace {namespace} {{
{yaml_forward_declaration_str}
}}
"""
node_declaration_str += f"""namespace {namespace} {{
{yaml_node_declaration_str}
}}
"""
node_definition_str += f"""namespace {namespace} {{
{yaml_node_definition_str}
}}
"""
generator = DygraphYamlGenerator(api_yaml_path, backward_yaml_path)
generator.run()
else:
forward_definition_str += yaml_forward_definition_str
forward_declaration_str += yaml_forward_declaration_str
node_declaration_str += yaml_node_declaration_str
node_definition_str += yaml_node_definition_str
node_declaration_str += generator.node_declaration_str + "\n"
node_definition_str += generator.node_definition_str + "\n"
forward_definition_str += generator.forward_definition_str + "\n"
forward_declaration_str += generator.forward_declaration_str + "\n"
# Generate Files
nodes_h_path = args.nodes_h_path
......@@ -1424,12 +1231,6 @@ if __name__ == "__main__":
forwards_h_path = args.forwards_h_path
forwards_cc_path = args.forwards_cc_path
for path in [
nodes_cc_path, nodes_h_path, forwards_h_path, forwards_cc_path
]:
if os.path.exists(path):
os.remove(path)
GenerateNodeCCFile(nodes_cc_path, node_definition_str)
GenerateNodeHFile(nodes_h_path, node_declaration_str)
GenerateForwardCCFile(forwards_cc_path, forward_definition_str)
......
......@@ -15,7 +15,10 @@
import os
import argparse
import logging
from eager_gen import namespace, yaml_types_mapping, ReadFwdFile, ParseDispensable, IsVectorTensorType, GetForwardFunctionName, ParseYamlForward, DetermineForwardPositionMap, GetInplacedFunctionName, ParseInplaceInfo
from codegen_utils import FunctionGeneratorBase, YamlGeneratorBase
from codegen_utils import yaml_types_mapping
from codegen_utils import ReadFwdFile, IsVectorTensorType, GetForwardFunctionName
from codegen_utils import ParseYamlForward, GetInplacedFunctionName
###########################
## Global Configurations ##
......@@ -121,7 +124,10 @@ FUNCTION_NAME_TEMPLATE = \
PYTHON_C_FUNCTION_REG_TEMPLATE = \
"{{\"final_state_{}\", (PyCFunction)(void(*)(void)) {}eager_final_state_api_{}, METH_VARARGS | METH_KEYWORDS, \"C++ interface function for {} in dygraph.\"}}"
"""
{{\"final_state_{}\", (PyCFunction)(void(*)(void)) {}eager_final_state_api_{}, METH_VARARGS | METH_KEYWORDS, \"C++ interface function for {} in dygraph.\"}}
"""
PYTHON_C_WRAPPER_TEMPLATE = \
......@@ -229,77 +235,39 @@ NAMESPACE_WRAPPER_TEMPLATE = \
#######################
## Generator Classes ##
#######################
class PythonCSingleFunctionGenerator:
def __init__(self, fwd_api_contents, namespace):
self.fwd_api_contents = fwd_api_contents
self.namespace = namespace
# Raw Contents
self.forward_api_name = ""
self.forward_args_str = ""
self.forward_returns_str = ""
# Raw Data
self.forward_attrs_list = None #[ [attr_name, attr_type, default_value, orig_position], ...]
self.forward_inputs_list = None #[ [arg_name, arg_type, orig_position], ...]
self.forward_returns_list = None #[ [ret_name, ret_type, orig_position], ...]
# Processed Data
self.forward_inputs_position_map = None #{ "name" : [type, fwd_position] }
self.forward_outputs_position_map = None #{ "name" : [type, fwd_position] }
# Special Op Attributes
self.optional_inputs = [] #[name, ...]
class PythonCSingleFunctionGenerator(FunctionGeneratorBase):
def __init__(self, forward_api_contents, namespace):
# Members from Parent:
#self.namespace
#self.forward_api_contents
#self.forward_api_name
#self.orig_forward_inputs_list
#self.orig_forward_attrs_list
#self.orig_forward_returns_list
#self.forward_inputs_position_map
#self.forward_outputs_position_map
#self.optional_inputs
#self.no_need_buffers
#self.intermediate_outputs
#self.inplace_map
FunctionGeneratorBase.__init__(self, forward_api_contents, namespace)
self.is_forward_only = True
# Generated Results
self.python_c_function_str = ""
self.python_c_function_reg_str = ""
def CollectRawContents(self):
fwd_api_contents = self.fwd_api_contents
assert 'api' in fwd_api_contents.keys(
), "Unable to find \"api\" in fwd_api_contents keys"
assert 'args' in fwd_api_contents.keys(
), "Unable to find \"args\" in fwd_api_contents keys"
assert 'output' in fwd_api_contents.keys(
), "Unable to find \"output\" in fwd_api_contents keys"
self.forward_api_name = fwd_api_contents['api']
self.forward_args_str = fwd_api_contents['args']
self.forward_returns_str = fwd_api_contents['output']
def CollectIsForwardOnly(self):
fwd_api_contents = self.fwd_api_contents
self.is_forward_only = False if 'backward' in fwd_api_contents.keys(
forward_api_contents = self.forward_api_contents
self.is_forward_only = False if 'backward' in forward_api_contents.keys(
) else True
def CollectOptionalInputs(self):
fwd_api_contents = self.fwd_api_contents
if 'optional' in fwd_api_contents.keys():
self.optional_inputs = ParseDispensable(fwd_api_contents[
'optional'])
def CollectForwardInOutAttr(self):
forward_args_str = self.forward_args_str
forward_returns_str = self.forward_returns_str
self.forward_inputs_list, self.forward_attrs_list, self.forward_returns_list = ParseYamlForward(
forward_args_str, forward_returns_str)
def CollectForwardPositionMap(self):
forward_inputs_list = self.forward_inputs_list
forward_returns_list = self.forward_returns_list
self.forward_inputs_position_map, self.forward_outputs_position_map = DetermineForwardPositionMap(
forward_inputs_list, forward_returns_list)
def GeneratePythonCFunction(self, inplace_map):
def GeneratePythonCFunction(self):
namespace = self.namespace
forward_api_name = GetInplacedFunctionName(
self.forward_api_name) if inplace_map else self.forward_api_name
forward_attrs_list = self.forward_attrs_list
inplace_map = self.inplace_map
forward_api_name = self.forward_api_name
orig_forward_attrs_list = self.orig_forward_attrs_list
forward_inputs_position_map = self.forward_inputs_position_map
forward_outputs_position_map = self.forward_outputs_position_map
optional_inputs = self.optional_inputs
......@@ -326,7 +294,7 @@ class PythonCSingleFunctionGenerator:
parse_attributes_str = ""
# Generate Python-C Attributes Parsing Logic
for name, atype, _, pos in forward_attrs_list:
for name, atype, _, pos in orig_forward_attrs_list:
parsing_function_name = FindParsingFunctionFromAttributeType(atype)
parse_attributes_str += PARSE_PYTHON_C_ARGS_TEMPLATE.format(
name, pos, atype, name, parsing_function_name, name,
......@@ -334,11 +302,11 @@ class PythonCSingleFunctionGenerator:
# Generate Dygraph Function Call Logic
num_args = len(forward_inputs_position_map.keys()) + len(
forward_attrs_list)
orig_forward_attrs_list)
dygraph_function_call_list = ["" for i in range(num_args)]
for name, (_, pos) in forward_inputs_position_map.items():
dygraph_function_call_list[pos] = f"{name}"
for name, _, _, pos in forward_attrs_list:
for name, _, _, pos in orig_forward_attrs_list:
dygraph_function_call_list[pos] = f"{name}"
dygraph_function_call_str = ",".join(dygraph_function_call_list)
......@@ -350,16 +318,6 @@ class PythonCSingleFunctionGenerator:
fwd_function_name = FUNCTION_NAME_TEMPLATE.format(
"::", namespace, GetForwardFunctionName(forward_api_name))
if inplace_map:
assert len(
inplace_map
) == 1, f"size of inplace_map must be 1, but inplace_map of \"{forward_api_name}\" op got {len(inplace_map)}"
for inplace_input, inplace_output in inplace_map.items():
return_str = RETURN_INPLACE_PYOBJECT_TEMPLATE.format(
forward_api_name, inplace_input, forward_api_name,
inplace_output)
break
else:
return_str = " return ToPyObject(out);"
# Generate Record Event for performance profiling
......@@ -374,29 +332,56 @@ class PythonCSingleFunctionGenerator:
self.python_c_function_reg_str = PYTHON_C_FUNCTION_REG_TEMPLATE.format(
forward_api_name, namespace, forward_api_name, forward_api_name)
def run(self, inplace_map):
if len(inplace_map) > 0:
inplaced_forward_api_name = GetInplacedFunctionName(
self.forward_api_name)
assert len(
inplace_map
) == 1, f"size of inplace_map must be 1, but inplace_map of \"{forward_api_name}\" op got {len(inplace_map)}"
for inplace_input, inplace_output in inplace_map.items():
return_str = RETURN_INPLACE_PYOBJECT_TEMPLATE.format(
inplaced_forward_api_name, inplace_input,
inplaced_forward_api_name, inplace_output)
break
self.python_c_function_str += PYTHON_C_FUNCTION_TEMPLATE.format(
inplaced_forward_api_name, pythonc_record_event_str,
inplaced_forward_api_name, get_eager_tensor_str,
parse_attributes_str, fwd_function_name,
dygraph_function_call_str, return_str)
# Generate Python-C Function Registration
self.python_c_function_reg_str += "\n," + PYTHON_C_FUNCTION_REG_TEMPLATE.format(
inplaced_forward_api_name, namespace, inplaced_forward_api_name,
inplaced_forward_api_name)
def run(self):
# Initialized is_forward_only
self.CollectIsForwardOnly()
# Initialized forward_api_name, forward_args_str, forward_returns_str
self.CollectRawContents()
if SkipAPIGeneration(self.forward_api_name): return False
# Initialized optional_inputs
self.CollectOptionalInputs()
self.ParseDispensable()
# Initialized inplace_map
self.ParseInplaceInfo()
# Initialized forward_inputs_list, forward_returns_list, forward_attrs_list
self.CollectForwardInOutAttr()
# Initialized orig_forward_inputs_list, orig_forward_returns_list, orig_forward_attrs_list
self.CollectOriginalForwardInfo()
logging.info(
f"Parsed Original Forward Inputs List: \n{self.forward_inputs_list}")
f"Parsed Original Forward Inputs List: \n{self.orig_forward_inputs_list}"
)
logging.info(
f"Prased Original Forward Attrs List: \n{self.forward_attrs_list}")
f"Prased Original Forward Attrs List: \n{self.orig_forward_attrs_list}"
)
logging.info(
f"Parsed Original Forward Returns List: \n{self.forward_returns_list}"
f"Parsed Original Forward Returns List: \n{self.orig_forward_returns_list}"
)
if SkipAPIGeneration(self.forward_api_name): return False
# Initialized forward_inputs_position_map, forward_outputs_position_map
self.CollectForwardPositionMap()
self.DetermineForwardPositionMap(self.orig_forward_inputs_list,
self.orig_forward_returns_list)
logging.info(
f"Generated Forward Input Position Map: {self.forward_inputs_position_map}"
)
......@@ -405,7 +390,7 @@ class PythonCSingleFunctionGenerator:
)
# Code Generation
self.GeneratePythonCFunction(inplace_map)
self.GeneratePythonCFunction()
logging.info(
f"Generated Python-C Function: {self.python_c_function_str}")
logging.info(
......@@ -415,21 +400,18 @@ class PythonCSingleFunctionGenerator:
return True
class PythonCYamlGenerator:
class PythonCYamlGenerator(YamlGeneratorBase):
def __init__(self, path):
self.yaml_path = path
self.namespace = ""
self.forward_api_list = []
# Parent members:
# self.namespace
# self.api_yaml_path
# self.forward_api_list
YamlGeneratorBase.__init__(self, path)
# Generated Result
self.python_c_functions_reg_str = ""
self.python_c_functions_str = ""
def ParseYamlContents(self):
yaml_path = self.yaml_path
self.forward_api_list = ReadFwdFile(yaml_path)
def GeneratePythonCFunctions(self):
namespace = self.namespace
forward_api_list = self.forward_api_list
......@@ -437,28 +419,12 @@ class PythonCYamlGenerator:
for forward_api_content in forward_api_list:
f_generator = PythonCSingleFunctionGenerator(forward_api_content,
namespace)
status = f_generator.run({})
status = f_generator.run()
if status == True:
self.python_c_functions_reg_str += f_generator.python_c_function_reg_str + ",\n"
self.python_c_functions_str += f_generator.python_c_function_str + "\n"
if 'inplace' in forward_api_content.keys():
inplace_map = ParseInplaceInfo(forward_api_content['inplace'])
f_generator_inplace = PythonCSingleFunctionGenerator(
forward_api_content, namespace)
status = f_generator_inplace.run(inplace_map)
if status == True:
self.python_c_functions_reg_str += f_generator_inplace.python_c_function_reg_str + ",\n"
self.python_c_functions_str += f_generator_inplace.python_c_function_str + "\n"
def InferNameSpace(self):
yaml_path = self.yaml_path
if "sparse" in yaml_path:
self.namespace = "sparse::"
def AttachNamespace(self):
namespace = self.namespace
python_c_functions_str = self.python_c_functions_str
......@@ -474,7 +440,7 @@ class PythonCYamlGenerator:
self.InferNameSpace()
# Read Yaml file
self.ParseYamlContents()
self.ParseForwardYamlContents()
# Code Generation
self.GeneratePythonCFunctions()
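# Hedged driver sketch showing how the generator class above might be exercised
# (the default yaml path is hypothetical, not taken from this module's argparse):
def _demo_generate_python_c(api_yaml_path="api.yaml"):  # hypothetical path
    generator = PythonCYamlGenerator(api_yaml_path)
    generator.run()
    return generator.python_c_functions_str, generator.python_c_functions_reg_str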
......
......@@ -51,8 +51,7 @@ static std::vector<std::string> GetTensorsName(
}
static void CheckInputVarStatus(const Tensor &tensor) {
PADDLE_ENFORCE_EQ(
tensor.defined() && phi::DenseTensor::classof(tensor.impl().get()), true,
PADDLE_ENFORCE_EQ(tensor.defined() && tensor.is_dense_tensor(), true,
paddle::platform::errors::InvalidArgument(
"The input tensor %s of "
"RunProgram(Grad)Op holds "
......@@ -74,7 +73,7 @@ static void CheckOutputVarStatus(const paddle::framework::Variable &src_var,
paddle::platform::errors::InvalidArgument(
"dst_tensor shall be defined."));
if (phi::DenseTensor::classof(dst_tensor.impl().get())) {
if (dst_tensor.is_dense_tensor()) {
auto &src_tensor = src_var.Get<phi::DenseTensor>();
PADDLE_ENFORCE_EQ(phi::DenseTensor::classof(&src_tensor), true,
paddle::platform::errors::InvalidArgument(
......@@ -88,7 +87,7 @@ static void CheckOutputVarStatus(const paddle::framework::Variable &src_var,
"RunProgram(Grad)Op's internal "
"scope is not initialized.",
name));
} else if (phi::SelectedRows::classof(dst_tensor.impl().get())) {
} else if (dst_tensor.is_selected_rows()) {
auto &src_tensor = src_var.Get<phi::SelectedRows>();
PADDLE_ENFORCE_EQ(phi::SelectedRows::classof(&src_tensor), true,
paddle::platform::errors::InvalidArgument(
......@@ -159,9 +158,6 @@ static void ShareTensorsFromScope(
name));
CheckOutputVarStatus(*var, *tensors[i]);
// share tensor
// TODO(dev): Determine Tensor type by scope.var
// auto tensor_base = tensors[i]->impl();
// if (phi::DenseTensor::classof(tensor_base.get())) {
if (var->IsType<phi::DenseTensor>()) {
auto &src_tensor = var->Get<phi::DenseTensor>();
auto *dst_tensor = const_cast<phi::DenseTensor *>(
......@@ -169,7 +165,6 @@ static void ShareTensorsFromScope(
VLOG(2) << "share " << name << " from scope";
*dst_tensor = src_tensor;
} else if (var->IsType<phi::SelectedRows>()) {
// } else if (phi::SelectedRows::classof(tensor_base.get())) {
auto &src_tensor = var->Get<phi::SelectedRows>();
auto *dst_tensor = const_cast<phi::SelectedRows *>(
dynamic_cast<const phi::SelectedRows *>(tensors[i]->impl().get()));
......@@ -202,7 +197,6 @@ inline void RunProgramAPI(
"The OutScope of RunProgramGradOp should only hold one scope."));
// Step 2. prepare executor and init persistable variables
// NOTE(Aurelius84): While training some models, forward can be called many
// times and then apply backpropagation all at once, such as Reinforcement
// Learning. Tensor data in multi-step training should be saved into single
......@@ -277,11 +271,6 @@ inline void RunProgramGradAPI(
// if all output vars are set to stop_gradient, grad op no need to executed
if (x_grad.empty() && params_grad.empty()) return;
// TODO(dev): Remove this line hard code. And need to deal with the out_grad
// name problem.
// const_cast<paddle::experimental::Tensor &>(out_grad[0])
// .set_name("matmul_v2_0.tmp_0@GRAD");
auto *global_block =
BOOST_GET_CONST(paddle::framework::BlockDesc *, attrs.at("global_block"));
auto orig_end_op_index = BOOST_GET_CONST(int64_t, attrs.at("end_op_index"));
......@@ -381,8 +370,8 @@ class GradNodeRunProgram : public egr::GradNodeBase {
VLOG(3) << "out_grads[0].size() : " << grads[0].size();
std::vector<paddle::experimental::Tensor> x_grad;
std::vector<paddle::experimental::Tensor> params_grad;
ConstructGradTensors(x_, &x_grad);
ConstructGradTensors(params_, &params_grad);
ConstructXGradTensors(x_, &x_grad);
ConstructParamGradTensors(params_, &params_grad);
std::vector<paddle::experimental::Tensor *> x_grad_ptr;
std::vector<paddle::experimental::Tensor *> params_grad_ptr;
for (auto &i : x_grad) {
......@@ -392,9 +381,6 @@ class GradNodeRunProgram : public egr::GradNodeBase {
params_grad_ptr.emplace_back(&i);
}
// auto x_grad_ptr = ConstructGradTensors(x_);
// auto params_grad_ptr = ConstructGradTensors(params_);
PADDLE_ENFORCE_EQ(
grads[0].size(), fwd_out_names_.size(),
paddle::platform::errors::InvalidArgument(
......@@ -412,7 +398,6 @@ class GradNodeRunProgram : public egr::GradNodeBase {
params_grad_ptr);
VLOG(3) << "End Eager Backward Node: GradNodeRunProgram";
return {x_grad, params_grad};
// return {x_grad, details::DereferenceTensors(params_grad_ptr)};
}
void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; }
......@@ -447,29 +432,35 @@ class GradNodeRunProgram : public egr::GradNodeBase {
}
protected:
void ConstructGradTensors(
const std::vector<paddle::experimental::Tensor> &fwd_tensors,
std::vector<paddle::experimental::Tensor> *grad_tensors) {
void ConstructXGradTensors(
const std::vector<paddle::experimental::Tensor> &x,
std::vector<paddle::experimental::Tensor> *x_grad) {
// TODO(dev): Need an elegant way to determine information of grad_tensor,
// such as: name, tensor type(DenseTensor or SelectedRows).
VLOG(3) << "fwd_tensors.size(): " << fwd_tensors.size();
for (auto &fwd_t : fwd_tensors) {
if (phi::DenseTensor::classof(fwd_t.impl().get())) {
grad_tensors->emplace_back(std::make_shared<phi::DenseTensor>());
} else if (phi::SelectedRows::classof(fwd_t.impl().get())) {
grad_tensors->emplace_back(std::make_shared<phi::SelectedRows>());
}
auto &grad_t = grad_tensors->back();
grad_t.set_name(fwd_t.name() + "@GRAD");
}
}
void ConstructGradTensors(
const std::vector<paddle::experimental::Tensor> &fwd_tensors) {
VLOG(3) << "fwd_tensors.size(): " << fwd_tensors.size();
for (auto &fwd_t : fwd_tensors) {
auto grad_tensor = egr::EagerUtils::unsafe_autograd_meta(fwd_t)->Grad();
grad_tensor.set_name(fwd_t.name() + "@GRAD");
for (auto &t : x) {
if (t.is_dense_tensor()) {
x_grad->emplace_back(std::make_shared<phi::DenseTensor>());
} else if (t.is_selected_rows()) {
x_grad->emplace_back(std::make_shared<phi::SelectedRows>());
}
x_grad->back().set_name(t.name() + "@GRAD");
}
}
void ConstructParamGradTensors(
const std::vector<paddle::experimental::Tensor> &param,
std::vector<paddle::experimental::Tensor> *param_grad) {
for (auto &t : param) {
auto t_meta = egr::EagerUtils::unsafe_autograd_meta(t);
auto t_grad = egr::EagerUtils::unsafe_autograd_meta(t)->Grad();
if (t_meta->StopGradient()) {
param_grad->emplace_back();
} else if (t_grad.is_dense_tensor()) {
param_grad->emplace_back(std::make_shared<phi::DenseTensor>());
} else if (t_grad.is_selected_rows()) {
param_grad->emplace_back(std::make_shared<phi::SelectedRows>());
}
param_grad->back().set_name(t.name() + "@GRAD");
}
}
......
......@@ -271,6 +271,7 @@ void EagerUtils::GetOutput(const std::shared_ptr<EagerVariable>& out,
"shared_ptr, this error may indicate some outputs "
"are nullptr"));
out_var->set_impl(out->GetTensorBase());
out_var->set_name(out->name());
}
void EagerUtils::GetOutputs(
......
......@@ -13,6 +13,9 @@ IF(WITH_GPU)
nv_library(graph_gpu_ps SRCS graph_gpu_ps_table.h DEPS heter_comm table)
nv_test(test_graph_comm SRCS test_graph.cu DEPS graph_gpu_ps)
nv_test(test_cpu_graph_sample SRCS test_cpu_graph_sample.cu DEPS graph_gpu_ps)
#nv_test(test_sample_rate SRCS test_sample_rate.cu DEPS graph_gpu_ps)
# ADD_EXECUTABLE(test_sample_rate test_sample_rate.cu)
# target_link_libraries(test_sample_rate graph_gpu_ps)
ENDIF()
IF(WITH_ROCM)
hip_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h DEPS cub device_context)
......
......@@ -93,14 +93,17 @@ node_list[8]-> node_id:17, neighbor_size:1, neighbor_offset:15
struct NeighborSampleResult {
int64_t *val;
int *actual_sample_size, sample_size, key_size;
int *offset;
NeighborSampleResult(int _sample_size, int _key_size)
: sample_size(_sample_size), key_size(_key_size) {
actual_sample_size = NULL;
val = NULL;
offset = NULL;
};
~NeighborSampleResult() {
if (val != NULL) cudaFree(val);
if (actual_sample_size != NULL) cudaFree(actual_sample_size);
if (offset != NULL) cudaFree(offset);
}
};
......
......@@ -71,10 +71,10 @@ TEST(TEST_FLEET, graph_sample) {
*/
::paddle::distributed::GraphParameter table_proto;
table_proto.set_gpups_mode(true);
table_proto.set_gpups_mode_shard_num(127);
table_proto.set_shard_num(127);
table_proto.set_gpu_num(3);
table_proto.set_gpups_graph_sample_class("BasicBfsGraphSampler");
table_proto.set_gpups_graph_sample_args("5,5,1,1");
table_proto.set_gpups_graph_sample_args("100,5,5,1,1");
prepare_file(edge_file_name, edges);
g.init_cpu_table(table_proto);
g.load(std::string(edge_file_name), std::string("e>"));
......@@ -93,16 +93,53 @@ TEST(TEST_FLEET, graph_sample) {
cudaMalloc((void **)&key, 3 * sizeof(int64_t));
cudaMemcpy(key, cpu_key, 3 * sizeof(int64_t), cudaMemcpyHostToDevice);
auto neighbor_sample_res = g.graph_neighbor_sample(0, (int64_t *)key, 3, 3);
int64_t *res = new int64_t[9];
cudaMemcpy(res, neighbor_sample_res->val, 72, cudaMemcpyDeviceToHost);
int64_t *res = new int64_t[7];
/*
cudaMemcpy(res, neighbor_sample_res->val, 56, cudaMemcpyDeviceToHost);
std::sort(res, res + 3);
std::sort(res + 6, res + 9);
int64_t expected_sample_val[] = {28, 29, 30, 0, -1, -1, 21, 22, 23};
for (int i = 0; i < 9; i++) {
std::sort(res + 4, res + 7);
//int64_t expected_sample_val[] = {28, 29, 30, 0, -1, -1, 21, 22, 23};
int64_t expected_sample_val[] = {28, 29, 30, 0, 21, 22, 23};
for (int i = 0; i < 7; i++) {
VLOG(0)<<i<<" "<<res[i];
if (expected_sample_val[i] != -1) {
ASSERT_EQ(res[i], expected_sample_val[i]);
}
}
delete[] res;
delete neighbor_sample_res;
*/
cudaMemcpy(res, neighbor_sample_res->val, 56, cudaMemcpyDeviceToHost);
int *actual_sample_size = new int[3];
cudaMemcpy(actual_sample_size, neighbor_sample_res->actual_sample_size, 12,
cudaMemcpyDeviceToHost); // 3, 1, 3
int *cumsum_sample_size = new int[3];
cudaMemcpy(cumsum_sample_size, neighbor_sample_res->offset, 12,
cudaMemcpyDeviceToHost); // 0, 3, 4
std::vector<std::vector<int64_t>> neighbors_;
std::vector<int64_t> neighbors_7 = {28, 29, 30, 31, 32, 33, 34, 35};
std::vector<int64_t> neighbors_0 = {0};
std::vector<int64_t> neighbors_6 = {21, 22, 23, 24, 25, 26, 27};
neighbors_.push_back(neighbors_7);
neighbors_.push_back(neighbors_0);
neighbors_.push_back(neighbors_6);
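// Each key's sampled neighbors occupy res[cumsum_sample_size[i] ..
// cumsum_sample_size[i] + actual_sample_size[i]) and must all come from its
// full neighbor list above.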
for (int i = 0; i < 3; i++) {
for (int j = cumsum_sample_size[i];
j < cumsum_sample_size[i] + actual_sample_size[i]; j++) {
bool flag = false;
for (int k = 0; k < neighbors_[i].size(); k++) {
if (res[j] == neighbors_[i][k]) {
flag = true;
break;
}
}
ASSERT_EQ(flag, true);
}
}
delete[] res;
delete[] actual_sample_size;
delete[] cumsum_sample_size;
delete neighbor_sample_res;
}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <unistd.h>
#include <condition_variable> // NOLINT
#include <fstream>
#include <iomanip>
#include <string>
#include <thread> // NOLINT
#include <unordered_set>
#include <vector>
#include "google/protobuf/text_format.h"
#include <chrono>
#include "gtest/gtest.h"
#include "paddle/fluid/distributed/ps.pb.h"
#include "paddle/fluid/distributed/ps/service/env.h"
#include "paddle/fluid/distributed/ps/service/sendrecv.pb.h"
#include "paddle/fluid/distributed/ps/table/common_graph_table.h"
#include "paddle/fluid/distributed/ps/table/graph/graph_node.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h"
#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h"
#include "paddle/fluid/framework/fleet/heter_ps/heter_comm.h"
#include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h"
#include "paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
using namespace paddle::framework;
namespace platform = paddle::platform;
namespace operators = paddle::operators;
namespace memory = paddle::memory;
namespace distributed = paddle::distributed;
std::string input_file;
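// fixed_key_size: keys drawn per sampling batch; sample_size: neighbors
// sampled per key; init_search_size: first field of the BFS sampler args
// (all overridable from the command line, see main()).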
int fixed_key_size = 100, sample_size = 100,
bfs_sample_nodes_in_each_shard = 10000, init_search_size = 1,
bfs_sample_edges = 20;
std::vector<std::string> edges = {
std::string("37\t45\t0.34"), std::string("37\t145\t0.31"),
std::string("37\t112\t0.21"), std::string("96\t48\t1.4"),
std::string("96\t247\t0.31"), std::string("96\t111\t1.21"),
std::string("59\t45\t0.34"), std::string("59\t145\t0.31"),
std::string("59\t122\t0.21"), std::string("97\t48\t0.34"),
std::string("97\t247\t0.31"), std::string("97\t111\t0.21")};
// odd id:96 48 122 112
char edge_file_name[] = "test_edges.txt";
void prepare_file(char file_name[], std::vector<std::string> data) {
std::ofstream ofile;
ofile.open(file_name);
for (auto x : data) {
ofile << x << std::endl;
}
ofile.close();
}
void testSampleRate() {
#ifdef PADDLE_WITH_HETERPS
std::vector<int64_t> ids;
int start = 0;
pthread_rwlock_t rwlock;
pthread_rwlock_init(&rwlock, NULL);
{
::paddle::distributed::GraphParameter table_proto;
table_proto.set_gpups_mode(false);
table_proto.set_shard_num(127);
table_proto.set_task_pool_size(24);
std::cerr << "initializing begin";
distributed::GraphTable graph_table;
graph_table.initialize(table_proto);
std::cerr << "initializing done";
graph_table.load(input_file, std::string("e>"));
int sample_actual_size = -1;
int step = fixed_key_size, cur = 0;
while (sample_actual_size != 0) {
std::unique_ptr<char[]> buffer;
graph_table.pull_graph_list(cur, step, buffer, sample_actual_size, false,
1);
int index = 0;
while (index < sample_actual_size) {
paddle::distributed::FeatureNode node;
node.recover_from_buffer(buffer.get() + index);
index += node.get_size(false);
// res.push_back(node);
ids.push_back(node.get_id());
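// Swap the newly collected id with a random earlier one so the id list is
// kept shuffled for the sampling threads below.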
int swap_pos = rand() % ids.size();
std::swap(ids[swap_pos], ids[(int)ids.size() - 1]);
}
cur = ids.size();
// if (sample_actual_size == 0) break;
// char *buff = buffer.get();
// for (int i = 0; i < sample_actual_size/sizeof(int64_t); i++) {
// ids.push_back(*((int64_t *)buff + i));
// int swap_pos = rand() % ids.size();
// std::swap(ids[swap_pos], ids[(int)ids.size() - 1]);
// }
// cur += sample_actual_size/sizeof(int64_t);
}
std::cerr << "load ids done" << std::endl;
std::vector<int64_t> sample_id[10], sample_neighbors[10];
std::vector<int> actual_size[10];
auto func = [&rwlock, &graph_table, &ids, &sample_id, &actual_size,
&sample_neighbors, &start](int i) {
while (true) {
int s, sn;
bool exit = false;
pthread_rwlock_wrlock(&rwlock);
if (start < ids.size()) {
s = start;
sn = ids.size() - start;
sn = min(sn, fixed_key_size);
start += sn;
} else {
exit = true;
}
pthread_rwlock_unlock(&rwlock);
if (exit) break;
std::vector<std::shared_ptr<char>> buffers(sn);
std::vector<int> ac(sn);
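// buffers holds each key's raw sampled-neighbor bytes; ac holds the byte
// length per key and is divided by sizeof(int64_t) below to get the count.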
auto status = graph_table.random_sample_neighbors(
ids.data() + s, sample_size, buffers, ac, false);
for (int j = s; j < s + sn; j++) {
sample_id[i].push_back(ids[j]);
actual_size[i].push_back(ac[j - s] / sizeof(int64_t));
int ss = ac[j - s] / sizeof(int64_t);
for (int k = 0; k < ss; k++) {
sample_neighbors[i].push_back(
*((int64_t *)(buffers[j - s].get() + k * sizeof(int64_t))));
}
}
}
VLOG(0) << "func " << i << " returns ";
};
auto start1 = std::chrono::steady_clock::now();
std::thread thr[10];
for (int i = 0; i < 10; i++) {
thr[i] = std::thread(func, i);
}
for (int i = 0; i < 10; i++) thr[i].join();
auto end1 = std::chrono::steady_clock::now();
auto tt =
std::chrono::duration_cast<std::chrono::microseconds>(end1 - start1);
std::cerr << "total time cost without cache is " << tt.count() << " us"
<< std::endl;
}
const int gpu_num = 8;
::paddle::distributed::GraphParameter table_proto;
table_proto.set_gpups_mode(true);
table_proto.set_shard_num(127);
table_proto.set_gpu_num(gpu_num);
table_proto.set_gpups_graph_sample_class("BasicBfsGraphSampler");
table_proto.set_gpups_graph_sample_args(std::to_string(init_search_size) +
",100000000,10000000,1,1");
std::vector<int> dev_ids;
for (int i = 0; i < gpu_num; i++) {
dev_ids.push_back(i);
}
std::shared_ptr<HeterPsResource> resource =
std::make_shared<HeterPsResource>(dev_ids);
resource->enable_p2p();
GpuPsGraphTable g(resource);
g.init_cpu_table(table_proto);
g.load(std::string(input_file), std::string("e>"));
NodeQueryResult *query_node_res;
query_node_res = g.query_node_list(0, 0, ids.size() + 10000);
VLOG(0) << "gpu got " << query_node_res->actual_sample_size << " nodes ";
VLOG(0) << "cpu got " << ids.size() << " nodes";
ASSERT_EQ((int)query_node_res->actual_sample_size, (int)ids.size());
int64_t *gpu_node_res = new int64_t[ids.size()];
cudaMemcpy(gpu_node_res, query_node_res->val, ids.size() * sizeof(int64_t),
cudaMemcpyDeviceToHost);
std::unordered_set<int64_t> cpu_node_set, gpu_node_set;
for (auto x : ids) {
cpu_node_set.insert(x);
}
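// Every node returned by the GPU table must exist in the CPU-loaded id set,
// and both sides must end up with the same number of distinct nodes.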
for (int i = 0; i < (int)query_node_res->actual_sample_size; i++) {
auto x = gpu_node_res[i];
ASSERT_EQ(cpu_node_set.find(x) != cpu_node_set.end(), true);
gpu_node_set.insert(x);
}
VLOG(0) << " cpu_node_size = " << cpu_node_set.size();
VLOG(0) << " gpu_node_size = " << gpu_node_set.size();
ASSERT_EQ(cpu_node_set.size(), gpu_node_set.size());
for (int i = 0; i < 20; i++) {
int st = ids.size() / 20 * i;
auto q = g.query_node_list(0, st, ids.size() / 20);
VLOG(0) << " the " << i << "th iteration size = " << q->actual_sample_size;
}
// NodeQueryResult *query_node_list(int gpu_id, int start, int query_size);
/*
void *key;
cudaMalloc((void **)&key, ids.size() * sizeof(int64_t));
cudaMemcpy(key, ids.data(), ids.size() * sizeof(int64_t),
cudaMemcpyHostToDevice);
std::vector<NeighborSampleResult *> res[gpu_num];
start = 0;
auto func = [&rwlock, &g, &res, &start,
&gpu_num, &ids, &key](int i) {
while (true) {
int s, sn;
bool exit = false;
pthread_rwlock_wrlock(&rwlock);
if (start < ids.size()) {
s = start;
sn = ids.size() - start;
sn = min(sn, fixed_key_size);
start += sn;
} else {
exit = true;
}
pthread_rwlock_unlock(&rwlock);
if (exit) break;
auto r =
g.graph_neighbor_sample(i, (int64_t *)(key + s), sample_size, sn);
res[i].push_back(r);
}
};
auto start1 = std::chrono::steady_clock::now();
std::thread thr[gpu_num];
for (int i = 0; i < gpu_num; i++) {
thr[i] = std::thread(func, i);
}
for (int i = 0; i < gpu_num; i++) thr[i].join();
auto end1 = std::chrono::steady_clock::now();
auto tt =
std::chrono::duration_cast<std::chrono::microseconds>(end1 - start1);
std::cerr << "total time cost without cache is " << tt.count() << " us"
<< std::endl;
*/
#endif
}
// TEST(testSampleRate, Run) { testSampleRate(); }
int main(int argc, char *argv[]) {
for (int i = 0; i < argc; i++)
VLOG(0) << "Argument " << i << " is " << std::string(argv[i]);
if (argc > 1) {
input_file = argv[1];
} else {
prepare_file(edge_file_name, edges);
input_file = edge_file_name;
}
VLOG(0) << "input_file is " << input_file;
if (argc > 2) {
fixed_key_size = std::stoi(argv[2]);
}
VLOG(0) << "sample_node_size for every batch is " << fixed_key_size;
if (argc > 3) {
sample_size = std::stoi(argv[3]);
}
VLOG(0) << "sample_size neighbor_size is " << sample_size;
if (argc > 4) init_search_size = std::stoi(argv[4]);
VLOG(0) << " init_search_size " << init_search_size;
testSampleRate();
}
......@@ -24,7 +24,7 @@
#include "paddle/fluid/framework/parallel_executor.h"
#include "paddle/fluid/framework/program_desc.h"
USE_OP(mul);
USE_OP_ITSELF(mul);
USE_OP(cinn_launch);
USE_OP_ITSELF(elementwise_add);
namespace paddle::framework {
......
......@@ -234,10 +234,26 @@ void InterpreterCore::Convert(
gc_check_input_list.erase(last, gc_check_input_list.end());
for (auto var_id : gc_check_input_list) {
paddle::framework::Variable* var = global_scope_->Var(var_id);
if (var->IsType<LoDTensor>() || var->IsType<phi::SelectedRows>() ||
var->IsType<LoDTensorArray>()) {
vec_meta_info[var_id].var_ref_count_++;
// TODO(zhiqiu): not every var needs to be checked; a var only needs to be
// checked after its last_live_op. For example,
// b = op1(a)
// c = op2(a, b)
// here a is an input of both op1 and op2, so we only need to check a after
// op2, because op2 always uses a after op1 does.
instr.AddGCCheckVar(var_id);
VLOG(4) << "clear " << global_scope_->GetNameById(var_id) << " after "
<< instr.OpBase()->Type();
} else {
VLOG(4) << "not clear " << global_scope_->GetNameById(var_id)
<< " after " << instr.OpBase()->Type()
<< " because its type is "
<< framework::ToTypeName(var->Type());
}
}
}
......
......@@ -674,7 +674,7 @@ TEST(BuildCinnPassTest, NoNeedBufferInput) {
} // namespace paddle
USE_PASS(build_cinn_pass);
USE_OP(mul);
USE_OP_ITSELF(mul);
USE_OP_ITSELF(relu);
USE_OP_ITSELF(elementwise_add);
USE_OP_ITSELF(relu_grad);
......
......@@ -300,6 +300,6 @@ TEST(CinnCompilerTest, Compile) {
USE_PASS(build_cinn_pass);
USE_PASS(graph_viz_pass);
USE_OP(mul);
USE_OP_ITSELF(mul);
USE_OP_ITSELF(relu);
USE_OP_ITSELF(elementwise_add);
......@@ -98,4 +98,4 @@ TEST(test_var_helper, eager_var_helper) {
} // namespace imperative
} // namespace paddle
USE_OP(mul);
USE_OP_ITSELF(mul);
......@@ -28,6 +28,8 @@
PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(matmul_with_flatten, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(matmul_with_flatten_grad, CPU, ALL_LAYOUT);
namespace platform = paddle::platform;
namespace framework = paddle::framework;
......@@ -267,7 +269,7 @@ TEST(TestHooks, TestGradVarLeafBackwardHookWithSortedGradAccmulated) {
} // namespace imperative
} // namespace paddle
USE_OP(mul);
USE_OP(mul_grad);
USE_OP_ITSELF(mul);
USE_OP_ITSELF(mul_grad);
USE_OP_ITSELF(elementwise_add);
USE_OP_ITSELF(elementwise_add_grad);
......@@ -416,4 +416,4 @@ TEST(test_layer, test_eager) {
} // namespace imperative
} // namespace paddle
USE_OP(mul);
USE_OP_ITSELF(mul);
......@@ -34,9 +34,13 @@ PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sum, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sum_grad, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(matmul_with_flatten, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(matmul_with_flatten_grad, CPU, ALL_LAYOUT);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sum_grad, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(matmul_with_flatten, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(matmul_with_flatten_grad, GPU, ALL_LAYOUT);
#endif
namespace imperative = paddle::imperative;
......@@ -598,8 +602,8 @@ TEST(test_tracer, eager_tracer) {
} // namespace imperative
} // namespace paddle
USE_OP(mul);
USE_OP(mul_grad);
USE_OP_ITSELF(mul);
USE_OP_ITSELF(mul_grad);
USE_OP_ITSELF(reduce_sum);
USE_OP_ITSELF(reduce_sum_grad);
USE_OP_ITSELF(elementwise_add);
......@@ -43,4 +43,4 @@ TEST(fc_op, test) {
} // namespace tensorrt
} // namespace inference
} // namespace paddle
USE_OP(mul);
USE_OP_ITSELF(mul);
......@@ -46,4 +46,4 @@ TEST(MulOpConverter, main) {
} // namespace inference
} // namespace paddle
USE_OP(mul);
USE_OP_ITSELF(mul);
......@@ -65,9 +65,10 @@ class MeanCUDAKernel : public framework::OpKernel<T> {
for (decltype(rank) i = 0; i < rank; ++i) {
reduce_dims.push_back(i);
}
TensorReduceImpl<T, T, kernel_primitives::AddFunctor, Div>(
context.cuda_device_context(), *input, output, Div(numel), reduce_dims,
stream);
TensorReduceImpl<T, T, kernel_primitives::AddFunctor,
kps::IdentityFunctor<T>>(
context.cuda_device_context(), *input, output,
kps::IdentityFunctor<T>(), reduce_dims, stream, true);
}
};
......
......@@ -14,7 +14,7 @@ limitations under the License. */
#include <string>
#include "paddle/fluid/operators/mul_op.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/mkldnn_reuse.h"
namespace phi {
......@@ -46,6 +46,9 @@ using dnnl::memory;
using dnnl::prop_kind;
using dnnl::stream;
constexpr int kMULMKLDNNINT8 = 1;
constexpr int kMULMKLDNNFP32 = 2;
template <typename XT, typename YT, typename OT>
class MulPrimitiveFactory {
public:
......
......@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/mul_op.h"
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
......@@ -27,6 +27,9 @@ namespace operators {
using framework::OpKernelType;
using framework::Tensor;
constexpr int kMULMKLDNNINT8 = 1;
constexpr int kMULMKLDNNFP32 = 2;
class MulOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
......@@ -354,16 +357,3 @@ REGISTER_OPERATOR(mul_grad, ops::MulGradOp,
ops::MulDoubleGradMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(mul_grad_grad, ops::MulDoubleGradOp);
REGISTER_OP_CPU_KERNEL(
mul, ops::MulKernel<paddle::platform::CPUDeviceContext, float>,
ops::MulKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
mul_grad, ops::MulGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::MulGradKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
mul_grad_grad,
ops::MulDoubleGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::MulDoubleGradKernel<paddle::platform::CPUDeviceContext, double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/mul_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(mul, ops::MulKernel<plat::CUDADeviceContext, float>,
ops::MulKernel<plat::CUDADeviceContext, double>,
ops::MulKernel<plat::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL(
mul_grad, ops::MulGradKernel<plat::CUDADeviceContext, float>,
ops::MulGradKernel<plat::CUDADeviceContext, double>,
ops::MulGradKernel<plat::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL(
mul_grad_grad,
ops::MulDoubleGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::MulDoubleGradKernel<paddle::platform::CUDADeviceContext, double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
constexpr int kMULMKLDNNINT8 = 1;
constexpr int kMULMKLDNNFP32 = 2;
template <typename DeviceContext, typename T>
class MulKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const Tensor* x = context.Input<Tensor>("X");
const Tensor* y = context.Input<Tensor>("Y");
Tensor* z = context.Output<Tensor>("Out");
const Tensor x_matrix =
x->dims().size() > 2
? framework::ReshapeToMatrix(
*x, context.template Attr<int>("x_num_col_dims"))
: *x;
const Tensor y_matrix =
y->dims().size() > 2
? framework::ReshapeToMatrix(
*y, context.template Attr<int>("y_num_col_dims"))
: *y;
z->mutable_data<T>(context.GetPlace());
auto z_dim = z->dims();
if (z_dim.size() != 2) {
z->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
}
auto blas = phi::funcs::GetBlas<DeviceContext, T>(context);
blas.MatMul(x_matrix, y_matrix, z);
if (z_dim.size() != 2) {
z->Resize(z_dim);
}
}
};
template <typename DeviceContext, typename T>
class MulGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
int x_num_col_dims = ctx.template Attr<int>("x_num_col_dims");
int y_num_col_dims = ctx.template Attr<int>("y_num_col_dims");
auto* x = ctx.Input<framework::LoDTensor>("X");
auto* y = ctx.Input<framework::LoDTensor>("Y");
auto x_matrix = x->dims().size() > 2
? framework::ReshapeToMatrix(*x, x_num_col_dims)
: static_cast<const Tensor&>(*x);
auto y_matrix = y->dims().size() > 2
? framework::ReshapeToMatrix(*y, y_num_col_dims)
: static_cast<const Tensor&>(*y);
auto* dout = ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"));
Tensor dout_mat;
dout_mat.ShareDataWith(*dout);
dout_mat.Resize({phi::flatten_to_2d(x->dims(), x_num_col_dims)[0],
phi::flatten_to_2d(y->dims(), y_num_col_dims)[1]});
auto* dx = ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<framework::LoDTensor>(framework::GradVarName("Y"));
if (dx != nullptr) {
dx->set_lod(x->lod());
}
if (dy != nullptr) {
dy->set_lod(y->lod());
}
auto& dev_ctx = ctx.template device_context<DeviceContext>();
auto blas = phi::funcs::GetBlas<DeviceContext, T>(dev_ctx);
if (dx) {
dx->mutable_data<T>(ctx.GetPlace());
Tensor dx_matrix = dx->dims().size() > 2
? framework::ReshapeToMatrix(*dx, x_num_col_dims)
: *dx;
// dx = dout * y'. dx: M x K, dout : M x N, y : K x N
blas.MatMul(dout_mat, false, y_matrix, true, &dx_matrix);
}
if (dy) {
dy->mutable_data<T>(ctx.GetPlace());
Tensor dy_matrix = dy->dims().size() > 2
? framework::ReshapeToMatrix(*dy, y_num_col_dims)
: *dy;
// dy = x' * dout. dy : K x N, dout : M x N, x : M x K
blas.MatMul(x_matrix, true, dout_mat, false, &dy_matrix);
}
}
};
template <typename DeviceContext, typename T>
class MulDoubleGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
int x_num_col_dims = ctx.template Attr<int>("x_num_col_dims");
int y_num_col_dims = ctx.template Attr<int>("y_num_col_dims");
auto* x = ctx.Input<framework::LoDTensor>("X");
auto* y = ctx.Input<framework::LoDTensor>("Y");
auto x_mat = x->dims().size() > 2
? framework::ReshapeToMatrix(*x, x_num_col_dims)
: static_cast<const Tensor&>(*x);
auto y_mat = y->dims().size() > 2
? framework::ReshapeToMatrix(*y, y_num_col_dims)
: static_cast<const Tensor&>(*y);
const int m = phi::flatten_to_2d(x->dims(), x_num_col_dims)[0];
const int n = phi::flatten_to_2d(y->dims(), y_num_col_dims)[1];
auto* dout = ctx.Input<framework::LoDTensor>("DOut");
Tensor dout_mat;
dout_mat.ShareDataWith(*dout);
dout_mat.Resize({m, n});
auto* ddx = ctx.Input<framework::LoDTensor>("DDX");
auto* ddy = ctx.Input<framework::LoDTensor>("DDY");
auto* dx = ctx.Output<framework::LoDTensor>("DX");
auto* dy = ctx.Output<framework::LoDTensor>("DY");
auto* ddout = ctx.Output<framework::LoDTensor>("DDOut");
Tensor ddout_mat;
if (ddout) {
ddout->set_lod(dout->lod());
// allocate and reshape ddout
ddout->mutable_data<T>(ctx.GetPlace());
ddout_mat.ShareDataWith(*ddout);
ddout_mat.Resize({m, n});
}
auto& dev_ctx = ctx.template device_context<DeviceContext>();
auto blas = phi::funcs::GetBlas<DeviceContext, T>(dev_ctx);
// a flag to specify whether the ddout value has been set: if the flag
// is false, MatMul beta should be 0 so that ddout is overwritten; if it
// is true, beta should be 1 so the result is added to ddout.
bool ddout_flag = false;
if (ddx) {
auto ddx_mat = ddx->dims().size() > 2
? framework::ReshapeToMatrix(*ddx, x_num_col_dims)
: static_cast<const Tensor&>(*ddx);
// dy = ddx' * dout. dy : K x N, ddx' : K x M, dout : M x N
if (dy) {
dy->set_lod(y->lod());
// allocate and reshape dy
dy->mutable_data<T>(ctx.GetPlace());
Tensor dy_mat = dy->dims().size() > 2
? framework::ReshapeToMatrix(*dy, y_num_col_dims)
: *dy;
blas.MatMul(ddx_mat, true, dout_mat, false, &dy_mat);
}
// ddout1 = ddx * y. ddx : M x K, y : K x N, ddout1 : M x N
if (ddout) {
blas.MatMul(ddx_mat, false, y_mat, false, static_cast<T>(1.0),
&ddout_mat, static_cast<T>(ddout_flag));
ddout_flag = true;
}
}
if (ddy) {
auto ddy_mat = ddy->dims().size() > 2
? framework::ReshapeToMatrix(*ddy, y_num_col_dims)
: static_cast<const Tensor&>(*ddy);
// dx = dout * ddy'. dout : M x N, ddy' : N x K, dx : M x K
if (dx) {
dx->set_lod(x->lod());
// allocate and reshape dx
dx->mutable_data<T>(ctx.GetPlace());
Tensor dx_mat = dx->dims().size() > 2
? framework::ReshapeToMatrix(*dx, x_num_col_dims)
: *dx;
blas.MatMul(dout_mat, false, ddy_mat, true, &dx_mat);
}
// ddout2 = x * ddy. x : M x K, ddy : K x N, ddout2 : M x N
if (ddout) {
blas.MatMul(x_mat, false, ddy_mat, false, static_cast<T>(1.0),
&ddout_mat, static_cast<T>(ddout_flag));
}
}
}
};
} // namespace operators
} // namespace paddle
......@@ -15,7 +15,7 @@ limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/operators/mul_op.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
......
......@@ -14,11 +14,11 @@ limitations under the License. */
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/operators/mul_op.h"
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
......
......@@ -14,8 +14,13 @@ limitations under the License. */
#include <memory>
#include <vector>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/multiary.h"
namespace paddle {
namespace operators {
......@@ -25,44 +30,6 @@ class MultiplexOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("Ids"), "Input", "Ids", "Multiplex");
PADDLE_ENFORCE_NE(
ctx->Inputs("X").empty(), true,
platform::errors::InvalidArgument("MultiInput(X) shouldn't be empty."));
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Multiplex");
auto ids_dim = ctx->GetInputDim("Ids");
PADDLE_ENFORCE_EQ(
ids_dim.size(), 2,
platform::errors::PreconditionNotMet(
"The index tensor must be a vector with 2 dimensions"));
PADDLE_ENFORCE_EQ(
ids_dim[1], 1,
platform::errors::PreconditionNotMet(
"The index tensor must be a vector with batchSize x 1."));
auto ins_dims = ctx->GetInputsDim("X");
auto num_ins = ins_dims.size();
PADDLE_ENFORCE_GT(num_ins, 1,
platform::errors::InvalidArgument(
"multiplex operator should have more than "
"one candidate input tensors."));
auto in_dim = ins_dims[0];
PADDLE_ENFORCE_GE(
in_dim.size(), 2,
platform::errors::InvalidArgument(
"The rank of candidate tensors must be not less than 2."));
for (size_t i = 1; i < num_ins; i++) {
auto dim = ins_dims[i];
PADDLE_ENFORCE_EQ(
in_dim, dim,
platform::errors::PreconditionNotMet(
"All the candidate tensors must have the same size."));
}
ctx->SetOutputDim("Out", in_dim);
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
......@@ -164,8 +131,11 @@ class MultiplexGradMaker : public framework::SingleGradOpMaker<T> {
} // namespace paddle
namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(multiplex, MultiplexInferShapeFunctor,
PD_INFER_META(phi::MultiplexInferMeta));
REGISTER_OPERATOR(multiplex, ops::MultiplexOp, ops::MultiplexOpMaker,
ops::MultiplexGradMaker<paddle::framework::OpDesc>,
ops::MultiplexGradMaker<paddle::imperative::OpBase>);
ops::MultiplexGradMaker<paddle::imperative::OpBase>,
MultiplexInferShapeFunctor);
REGISTER_OPERATOR(multiplex_grad, ops::MultiplexGradOp);
......@@ -21,6 +21,10 @@
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/unary.h"
namespace paddle {
namespace operators {
......@@ -29,43 +33,6 @@ using DDim = framework::DDim;
class QrOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "qr");
OP_INOUT_CHECK(ctx->HasOutput("Q"), "Output", "Q", "qr");
OP_INOUT_CHECK(ctx->HasOutput("R"), "Output", "R", "qr");
auto x_dims = ctx->GetInputDim("X");
int x_rank = x_dims.size();
PADDLE_ENFORCE_GE(x_dims.size(), 2,
platform::errors::InvalidArgument(
"the rank of input must greater than 2"));
bool compute_q;
bool reduced_mode;
int m = x_dims[x_rank - 2];
int n = x_dims[x_rank - 1];
int min_mn = std::min(m, n);
std::string mode = ctx->Attrs().Get<std::string>("mode");
std::tie(compute_q, reduced_mode) = _parse_qr_mode(mode);
if (compute_q) {
int k = reduced_mode ? min_mn : m;
auto q_dims_vec = phi::vectorize(x_dims);
q_dims_vec[q_dims_vec.size() - 1] = k;
ctx->SetOutputDim("Q", phi::make_ddim(q_dims_vec));
} else {
ctx->SetOutputDim("Q", phi::make_ddim({0}));
}
int k = reduced_mode ? min_mn : m;
auto r_dims_vec = phi::vectorize(x_dims);
r_dims_vec[r_dims_vec.size() - 2] = k;
r_dims_vec[r_dims_vec.size() - 1] = n;
ctx->SetOutputDim("R", phi::make_ddim(r_dims_vec));
ctx->ShareLoD("X", /*->*/ "Q");
ctx->ShareLoD("X", /*->*/ "R");
}
};
class QrOpMaker : public framework::OpProtoAndCheckerMaker {
......@@ -83,10 +50,8 @@ class QrOpMaker : public framework::OpProtoAndCheckerMaker {
.SetDefault("reduced");
AddComment(R"DOC(
Qr Operator.
This operator is used to perform the QR operation for batched matrices $X$.
$$Q, R = qr(X)$$
)DOC");
}
};
......@@ -138,10 +103,13 @@ class QrGradMaker : public framework::SingleGradOpMaker<T> {
} // namespace paddle
namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(qr, QrInferShapeFunctor,
PD_INFER_META(phi::QrInferMeta));
REGISTER_OPERATOR(qr, ops::QrOp, ops::QrOpMaker,
ops::QrGradMaker<paddle::framework::OpDesc>,
ops::QrGradMaker<paddle::imperative::OpBase>);
ops::QrGradMaker<paddle::imperative::OpBase>,
QrInferShapeFunctor);
REGISTER_OPERATOR(qr_grad, ops::QrGradOp);
......
......@@ -33,12 +33,12 @@ void TensorReduceImpl(const platform::CUDADeviceContext& dev_ctx,
const framework::Tensor& x, framework::Tensor* y,
const TransformOp& transform,
const std::vector<int>& origin_reduce_dims,
gpuStream_t stream) {
gpuStream_t stream, bool is_mean = false) {
y->mutable_data<Ty>(x.place());
phi::funcs::ReduceKernel<Tx, Ty, ReduceOp, TransformOp>(
static_cast<const phi::GPUContext&>(dev_ctx), x, y, transform,
origin_reduce_dims);
origin_reduce_dims, is_mean);
}
} // namespace operators
......
......@@ -13,29 +13,18 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/unary.h"
namespace paddle {
namespace operators {
class TrilTriuOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE_EQ(
ctx->HasInput("X"), true,
platform::errors::NotFound("Input(X) of TrilTriuOp is not found."));
PADDLE_ENFORCE_EQ(
ctx->HasOutput("Out"), true,
platform::errors::NotFound("Output(Out) of TrilTriuOp is not found."));
const auto& x_dims = ctx->GetInputDim("X");
PADDLE_ENFORCE_GE(x_dims.size(), 2,
platform::errors::InvalidArgument(
"Input(X)'s rank must be at least 2 in TrilTriuOp."));
ctx->SetOutputDim("Out", x_dims);
ctx->ShareLoD("X", /*->*/ "Out");
}
};
class TrilTriuOpMaker : public framework::OpProtoAndCheckerMaker {
......@@ -100,7 +89,10 @@ class TrilTriuGradOpMaker : public framework::SingleGradOpMaker<T> {
namespace ops = paddle::operators;
namespace plat = paddle::platform;
DECLARE_INFER_SHAPE_FUNCTOR(tril_triu, TrilTriuInferShapeFunctor,
PD_INFER_META(phi::TrilTriuInferMeta));
REGISTER_OPERATOR(tril_triu, ops::TrilTriuOp, ops::TrilTriuOpMaker,
ops::TrilTriuGradOpMaker<paddle::framework::OpDesc>,
ops::TrilTriuGradOpMaker<paddle::imperative::OpBase>);
ops::TrilTriuGradOpMaker<paddle::imperative::OpBase>,
TrilTriuInferShapeFunctor);
REGISTER_OPERATOR(tril_triu_grad, ops::TrilTriuGradOp);
......@@ -14,6 +14,7 @@
#pragma once
#include <iostream>
#include "paddle/phi/core/enforce.h"
static PyObject *eager_api_run_program(PyObject *self, PyObject *args,
PyObject *kwargs) {
......@@ -33,13 +34,24 @@ static PyObject *eager_api_run_program(PyObject *self, PyObject *args,
run_program_dygraph_function(X, Params, Out, OutScope, DOut, attrs);
PyEval_RestoreThread(tstate);
tstate = nullptr;
Py_RETURN_NONE;
} catch (paddle::platform::EnforceNotMet &exception) {
if (tstate) {
PyEval_RestoreThread(tstate);
}
std::ostringstream sout;
sout << exception.what();
sout << " [operator < run_program > error]";
exception.set_error_str(sout.str());
ThrowExceptionToPython(std::current_exception());
return nullptr;
} catch (...) {
if (tstate) {
PyEval_RestoreThread(tstate);
}
ThrowExceptionToPython(std::current_exception());
return nullptr;
}
Py_RETURN_NONE;
}
static PyMethodDef CustomEagerFinalStateMethods[] = {
......
......@@ -40,6 +40,9 @@ limitations under the License. */
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/sparse_coo_tensor.h"
#include "paddle/phi/core/sparse_csr_tensor.h"
namespace paddle {
namespace pybind {
......@@ -468,6 +471,90 @@ static PyObject* eager_api_run_costum_op(PyObject* self, PyObject* args,
EAGER_CATCH_AND_THROW_RETURN_NULL
}
static PyObject* eager_api_sparse_coo_tensor(PyObject* self, PyObject* args,
PyObject* kwargs) {
EAGER_TRY
auto non_zero_indices = CastPyArg2Tensor(PyTuple_GET_ITEM(args, 0), 0);
auto non_zero_elements = CastPyArg2Tensor(PyTuple_GET_ITEM(args, 1), 1);
auto dense_shape = CastPyArg2VectorOfInt(PyTuple_GET_ITEM(args, 2), 2);
auto stop_gradient = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 3), 3);
PADDLE_ENFORCE(non_zero_indices.is_dense_tensor(),
paddle::platform::errors::Fatal(
"the non-zero indices must be a DenseTensor."));
PADDLE_ENFORCE(non_zero_elements.is_dense_tensor(),
paddle::platform::errors::Fatal(
"the non-zero elements must be a DenseTensor."));
auto dense_indices =
std::dynamic_pointer_cast<phi::DenseTensor>(non_zero_indices.impl());
auto dense_elements =
std::dynamic_pointer_cast<phi::DenseTensor>(non_zero_elements.impl());
// TODO(zhangkaihuo): After create SparseTensor, call coalesced() to sort and
// merge duplicate indices
std::shared_ptr<phi::SparseCooTensor> coo_tensor =
std::make_shared<phi::SparseCooTensor>(*dense_indices, *dense_elements,
phi::make_ddim(dense_shape));
paddle::experimental::Tensor tensor;
tensor.set_impl(coo_tensor);
auto name =
egr::Controller::Instance().GenerateUniqueName("generated_tensor");
tensor.set_name(name);
auto autograd_meta = egr::EagerUtils::autograd_meta(&tensor);
autograd_meta->SetStopGradient(static_cast<bool>(stop_gradient));
if (!autograd_meta->GetMutableGradNode()) {
VLOG(3) << "Tensor(" << name
<< ") have not GradNode, add GradNodeAccumulation for it.";
autograd_meta->SetGradNode(
std::make_shared<egr::GradNodeAccumulation>(autograd_meta));
}
return ToPyObject(tensor);
EAGER_CATCH_AND_THROW_RETURN_NULL
}
static PyObject* eager_api_sparse_csr_tensor(PyObject* self, PyObject* args,
PyObject* kwargs) {
EAGER_TRY
auto non_zero_crows = CastPyArg2Tensor(PyTuple_GET_ITEM(args, 0), 0);
auto non_zero_cols = CastPyArg2Tensor(PyTuple_GET_ITEM(args, 1), 1);
auto non_zero_elements = CastPyArg2Tensor(PyTuple_GET_ITEM(args, 2), 2);
auto dense_shape = CastPyArg2VectorOfInt(PyTuple_GET_ITEM(args, 3), 3);
auto stop_gradient = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 4), 4);
PADDLE_ENFORCE(non_zero_crows.is_dense_tensor(),
paddle::platform::errors::Fatal(
"the compressed non-zero rows must be a DenseTensor."));
PADDLE_ENFORCE(non_zero_cols.is_dense_tensor(),
paddle::platform::errors::Fatal(
"the non-zero cols must be a DenseTensor."));
PADDLE_ENFORCE(non_zero_elements.is_dense_tensor(),
paddle::platform::errors::Fatal(
"the non-zero elements must be a DenseTensor."));
auto dense_crows =
std::dynamic_pointer_cast<phi::DenseTensor>(non_zero_crows.impl());
auto dense_cols =
std::dynamic_pointer_cast<phi::DenseTensor>(non_zero_cols.impl());
auto dense_elements =
std::dynamic_pointer_cast<phi::DenseTensor>(non_zero_elements.impl());
std::shared_ptr<phi::SparseCsrTensor> csr_tensor =
std::make_shared<phi::SparseCsrTensor>(*dense_crows, *dense_cols,
*dense_elements,
phi::make_ddim(dense_shape));
paddle::experimental::Tensor tensor;
tensor.set_impl(csr_tensor);
auto name =
egr::Controller::Instance().GenerateUniqueName("generated_tensor");
tensor.set_name(name);
auto autograd_meta = egr::EagerUtils::autograd_meta(&tensor);
autograd_meta->SetStopGradient(static_cast<bool>(stop_gradient));
if (!autograd_meta->GetMutableGradNode()) {
VLOG(3) << "Tensor(" << name
<< ") have not GradNode, add GradNodeAccumulation for it.";
autograd_meta->SetGradNode(
std::make_shared<egr::GradNodeAccumulation>(autograd_meta));
}
return ToPyObject(tensor);
EAGER_CATCH_AND_THROW_RETURN_NULL
}
PyMethodDef variable_functions[] = {
// TODO(jiabin): Remove scale when we have final state tests
{"scale", (PyCFunction)(void (*)(void))eager_api_scale,
......@@ -490,6 +577,14 @@ PyMethodDef variable_functions[] = {
{"read_next_tensor_list",
(PyCFunction)(void (*)(void))eager_api_read_next_tensor_list,
METH_VARARGS | METH_KEYWORDS, NULL},
/**sparse functions**/
{"sparse_coo_tensor",
(PyCFunction)(void (*)(void))eager_api_sparse_coo_tensor,
METH_VARARGS | METH_KEYWORDS, NULL},
{"sparse_csr_tensor",
(PyCFunction)(void (*)(void))eager_api_sparse_csr_tensor,
METH_VARARGS | METH_KEYWORDS, NULL},
/**sparse functions**/
{NULL, NULL, 0, NULL}};
void BindFunctions(PyObject* module) {
......
......@@ -959,11 +959,11 @@ static PyObject* tensor__set_grad_type(TensorObject* self, PyObject* args,
EAGER_TRY
auto var_type = pybind::CastPyArg2ProtoType(PyTuple_GET_ITEM(args, 0), 0);
auto grad_tensor =
egr::EagerUtils::unsafe_autograd_meta(self->tensor)->Grad();
egr::EagerUtils::unsafe_autograd_meta(self->tensor)->MutableGrad();
if (var_type == framework::proto::VarType::LOD_TENSOR) {
grad_tensor.set_impl(std::make_shared<phi::DenseTensor>());
grad_tensor->set_impl(std::make_shared<phi::DenseTensor>());
} else if (var_type == framework::proto::VarType::SELECTED_ROWS) {
grad_tensor.set_impl(std::make_shared<phi::SelectedRows>());
grad_tensor->set_impl(std::make_shared<phi::SelectedRows>());
}
return Py_None;
EAGER_CATCH_AND_THROW_RETURN_NULL
......@@ -1097,6 +1097,49 @@ static PyObject* tensor_method_is_sparse_csr(TensorObject* self, PyObject* args,
EAGER_CATCH_AND_THROW_RETURN_NULL
}
static PyObject* tensor_method_to_sparse_coo(TensorObject* self, PyObject* args,
PyObject* kwargs) {
EAGER_TRY
int64_t sparse_dim = CastPyArg2AttrLong(PyTuple_GET_ITEM(args, 0), 0);
auto coo_tensor = self->tensor.to_sparse_coo(sparse_dim);
egr::EagerUtils::autograd_meta(&coo_tensor)
->SetStopGradient(
egr::EagerUtils::autograd_meta(&self->tensor)->StopGradient());
egr::EagerUtils::autograd_meta(&coo_tensor)
->SetPersistable(
egr::EagerUtils::autograd_meta(&(self->tensor))->Persistable());
return ToPyObject(coo_tensor);
EAGER_CATCH_AND_THROW_RETURN_NULL
}
static PyObject* tensor_method_to_sparse_csr(TensorObject* self, PyObject* args,
PyObject* kwargs) {
EAGER_TRY
auto csr_tensor = self->tensor.to_sparse_csr();
egr::EagerUtils::autograd_meta(&csr_tensor)
->SetStopGradient(
egr::EagerUtils::autograd_meta(&self->tensor)->StopGradient());
egr::EagerUtils::autograd_meta(&csr_tensor)
->SetPersistable(
egr::EagerUtils::autograd_meta(&(self->tensor))->Persistable());
return ToPyObject(csr_tensor);
EAGER_CATCH_AND_THROW_RETURN_NULL
}
static PyObject* tensor_method_to_dense(TensorObject* self, PyObject* args,
PyObject* kwargs) {
EAGER_TRY
auto dense_tensor = self->tensor.to_dense();
egr::EagerUtils::autograd_meta(&dense_tensor)
->SetStopGradient(
egr::EagerUtils::autograd_meta(&self->tensor)->StopGradient());
egr::EagerUtils::autograd_meta(&dense_tensor)
->SetPersistable(
egr::EagerUtils::autograd_meta(&(self->tensor))->Persistable());
return ToPyObject(dense_tensor);
EAGER_CATCH_AND_THROW_RETURN_NULL
}
static PyObject* tensor__inplace_version(TensorObject* self, PyObject* args,
PyObject* kwargs) {
EAGER_TRY
......@@ -1185,6 +1228,12 @@ PyMethodDef variable_methods[] = {
METH_VARARGS | METH_KEYWORDS, NULL},
{"is_sparse_csr", (PyCFunction)(void (*)(void))tensor_method_is_sparse_csr,
METH_VARARGS | METH_KEYWORDS, NULL},
{"to_sparse_coo", (PyCFunction)(void (*)(void))tensor_method_to_sparse_coo,
METH_VARARGS | METH_KEYWORDS, NULL},
{"to_sparse_csr", (PyCFunction)(void (*)(void))tensor_method_to_sparse_csr,
METH_VARARGS | METH_KEYWORDS, NULL},
{"to_dense", (PyCFunction)(void (*)(void))tensor_method_to_dense,
METH_VARARGS | METH_KEYWORDS, NULL},
/***the method of sparse tensor****/
{"_inplace_version", (PyCFunction)(void (*)(void))tensor__inplace_version,
METH_VARARGS | METH_KEYWORDS, NULL},
......
......@@ -33,19 +33,21 @@ namespace tensorrt {
static nvinfer1::IBuilder* createInferBuilder(
nvinfer1::ILogger& logger) { // NOLINT
return static_cast<nvinfer1::IBuilder*>(
phi::dynload::createInferBuilder_INTERNAL(&logger, NV_TENSORRT_VERSION));
::phi::dynload::createInferBuilder_INTERNAL(&logger,
NV_TENSORRT_VERSION));
}
static nvinfer1::IRuntime* createInferRuntime(
nvinfer1::ILogger& logger) { // NOLINT
return static_cast<nvinfer1::IRuntime*>(
phi::dynload::createInferRuntime_INTERNAL(&logger, NV_TENSORRT_VERSION));
::phi::dynload::createInferRuntime_INTERNAL(&logger,
NV_TENSORRT_VERSION));
}
TrtEngine::TrtEngine(int device_id) : device_id_(device_id) {
FreshDeviceId();
logger_.reset(new TrtLogger());
builder_.reset(createInferBuilder(logger_->GetTrtLogger()));
phi::dynload::initLibNvInferPlugins(&logger_->GetTrtLogger(), "");
::phi::dynload::initLibNvInferPlugins(&logger_->GetTrtLogger(), "");
}
nvinfer1::IBuilder* TrtEngine::GetTrtBuilder() {
......@@ -237,11 +239,11 @@ bool TrtEngine::SetupNetworkAndConfig(const BuildOptions& build,
}
void TrtEngine::PrepareOutputHandle(const std::string& out_name) {
phi::DenseTensor t;
::phi::DenseTensor t;
outputs_.emplace(out_name, t);
}
phi::DenseTensor* TrtEngine::GetOutput(const std::string& name) {
::phi::DenseTensor* TrtEngine::GetOutput(const std::string& name) {
return &outputs_[name];
}
......@@ -249,7 +251,7 @@ size_t TrtEngine::GetOutputNum() const { return outputs_.size(); }
bool TrtEngine::SetUpInference(
const InferenceOptions& inference,
const std::unordered_map<std::string, phi::DenseTensor*>& inputs) {
const std::unordered_map<std::string, ::phi::DenseTensor*>& inputs) {
// TODO(wilber): now only create one exec_context
FreshDeviceId();
CHECK(engine_ != nullptr);
......@@ -272,7 +274,7 @@ bool TrtEngine::SetUpInference(
return true;
}
void TrtEngine::Run(const phi::GPUContext& ctx) {
void TrtEngine::Run(const ::phi::GPUContext& ctx) {
if (is_dynamic_shape_) {
DynamicRun(ctx);
} else {
......@@ -280,7 +282,7 @@ void TrtEngine::Run(const phi::GPUContext& ctx) {
}
}
void TrtEngine::StaticRun(const phi::GPUContext& ctx) {
void TrtEngine::StaticRun(const ::phi::GPUContext& ctx) {
const int num_bindings = engine_->getNbBindings();
std::vector<void*> buffers(num_bindings, nullptr);
......@@ -291,7 +293,8 @@ void TrtEngine::StaticRun(const phi::GPUContext& ctx) {
buffers[bind_index] =
const_cast<void*>(static_cast<const void*>(bind.buffer->data<float>()));
if (runtime_batch != -1) {
CHECK_EQ(runtime_batch, phi::vectorize<int64_t>(bind.buffer->dims())[0]);
CHECK_EQ(runtime_batch,
::phi::vectorize<int64_t>(bind.buffer->dims())[0]);
}
runtime_batch = bind.buffer->dims()[0];
}
......@@ -306,7 +309,7 @@ void TrtEngine::StaticRun(const phi::GPUContext& ctx) {
for (int i = 0; i < dims.nbDims; ++i) {
ddim.push_back(dims.d[i]);
}
bind.buffer->Resize(phi::make_ddim(ddim));
bind.buffer->Resize(::phi::make_ddim(ddim));
// TODO(wilber): now only support float output.
ctx.Alloc<float>(bind.buffer, sizeof(float) * bind.buffer->numel());
buffers[bind_index] = static_cast<void*>(bind.buffer->data<float>());
......@@ -316,7 +319,7 @@ void TrtEngine::StaticRun(const phi::GPUContext& ctx) {
runtime_batch, buffers.data(), ctx.stream(), nullptr);
}
void TrtEngine::DynamicRun(const phi::GPUContext& ctx) {
void TrtEngine::DynamicRun(const ::phi::GPUContext& ctx) {
const int num_bindings = engine_->getNbBindings();
std::vector<void*> buffers(num_bindings, nullptr);
......@@ -344,7 +347,7 @@ void TrtEngine::DynamicRun(const phi::GPUContext& ctx) {
for (int i = 0; i < dims.nbDims; ++i) {
ddim[i] = dims.d[i];
}
bind.buffer->Resize(phi::make_ddim(ddim));
bind.buffer->Resize(::phi::make_ddim(ddim));
ctx.Alloc<float>(bind.buffer, sizeof(float) * bind.buffer->numel());
buffers[bind_index] = static_cast<void*>(bind.buffer->data<float>());
}
......@@ -356,7 +359,7 @@ void TrtEngine::FreshDeviceId() {
int count;
cudaGetDeviceCount(&count);
CHECK_LT(device_id_, count);
phi::backends::gpu::SetDeviceId(device_id_);
::phi::backends::gpu::SetDeviceId(device_id_);
}
void TrtEngine::GetEngineInfo() {
......
......@@ -76,19 +76,19 @@ class TrtEngine {
const BuildOptions& build_options);
// TODO(wilber): Modify signature after infrt-trt ready.
void Run(const phi::GPUContext& ctx);
void Run(const ::phi::GPUContext& ctx);
// TODO(wilber): How to support multiple execution contexts?
bool SetUpInference(
const InferenceOptions& inference,
const std::unordered_map<std::string, phi::DenseTensor*>& inputs);
const std::unordered_map<std::string, ::phi::DenseTensor*>& inputs);
void GetEngineInfo();
void PrepareOutputHandle(const std::string& out_name);
// TODO(wilber): The output tensor names are: output_0, output_1, ...
phi::DenseTensor* GetOutput(const std::string&);
::phi::DenseTensor* GetOutput(const std::string&);
size_t GetOutputNum() const;
......@@ -104,9 +104,9 @@ class TrtEngine {
bool ModelToBuildEnv(TrtUniquePtr<nvinfer1::INetworkDefinition> network,
const BuildOptions& build);
void StaticRun(const phi::GPUContext& ctx);
void StaticRun(const ::phi::GPUContext& ctx);
void DynamicRun(const phi::GPUContext& ctx);
void DynamicRun(const ::phi::GPUContext& ctx);
private:
std::unique_ptr<TrtLogger> logger_{nullptr};
......@@ -118,7 +118,7 @@ class TrtEngine {
std::vector<std::unique_ptr<Bindings>> bindings_;
int device_id_{0};
bool is_dynamic_shape_{false};
std::unordered_map<std::string, phi::DenseTensor> outputs_;
std::unordered_map<std::string, ::phi::DenseTensor> outputs_;
};
} // namespace tensorrt
......
......@@ -92,7 +92,7 @@ class TrtLogger : public nvinfer1::ILogger {
struct Binding {
bool is_input{false};
nvinfer1::DataType data_type{nvinfer1::DataType::kFLOAT};
phi::DenseTensor* buffer{nullptr};
::phi::DenseTensor* buffer{nullptr};
std::string name;
};
......@@ -103,7 +103,7 @@ class Bindings {
void AddBinding(int32_t b,
const std::string& name,
bool is_input,
phi::DenseTensor* buffer,
::phi::DenseTensor* buffer,
nvinfer1::DataType data_type) {
while (bindings_.size() <= static_cast<size_t>(b)) {
bindings_.emplace_back();
......
......@@ -97,4 +97,17 @@ def FakeKernelOp : PDT_Op<"fake_phi_kernel"> {
let results = (outs DenseTensor:$output);
}
// TODO(wilber): Add a infrt_gpu dialect.
def PDT_GpuMemCopyOp : PDT_Op<"memcpy.gpu", [NoSideEffect]> {
let summary = "phi_dt.gpu.memcpy";
let description = [{gpu memcpy d2h or h2d}];
// TODO(wilber): add context argument to support stream.
let arguments = (ins
DenseTensor:$input,
Context:$context,
BoolAttr:$d2h
);
let results = (outs DenseTensor:$output);
}
#endif
......@@ -97,12 +97,13 @@ void PhiOpConvertPass::convertStage() {
}
auto loc = getFunction().getLoc();
builder.setInsertionPoint(op);
if (phi::KernelFactory::Instance().HasCompatiblePhiKernel(op_name)) {
std::string kernel_name = phi::TransToPhiKernelName(op_name);
if (!::phi::OpUtilsMap::Instance().HasArgumentMappingFn(op_name)) {
op_name = phi::TransToPhiKernelName(op_name);
auto kernel_op = builder.create<infrt::KernelOp>(loc,
op->getResultTypes(),
op->getOperands(),
kernel_name,
op_name,
op->getAttrDictionary());
op->replaceAllUsesWith(kernel_op.getResults());
} else {
......
......@@ -32,17 +32,24 @@ bool ProtoArgumentMappingContext::HasOutput(const std::string& name) const {
}
bool ProtoArgumentMappingContext::HasAttr(const std::string& name) const {
if (name == "is_test") return true;
return op_->hasAttr(name);
}
paddle::any ProtoArgumentMappingContext::Attr(const std::string& name) const {
mlir::Attribute attrs = op_->getAttr(name);
if (mlir::StringAttr str_attr = attrs.dyn_cast_or_null<mlir::StringAttr>()) {
if (name == "is_test") {
return paddle::any(true);
}
mlir::Attribute attr = op_->getAttr(name);
if (!attr) {
return paddle::any();
}
if (mlir::StringAttr str_attr = attr.dyn_cast<mlir::StringAttr>()) {
return paddle::any(str_attr.str());
} else {
}
// TODO: implementation in the ext PR.
return paddle::any(0);
}
}
size_t ProtoArgumentMappingContext::InputSize(const std::string& name) const {
......
......@@ -6,6 +6,7 @@ gather_srcs(infrt_src SRCS
trt_op_teller_pass.cc
trt_graph_fuse_pass.cc
trt_graph_split_pass.cc
trt_type_convert_pass.cc
)
mlir_tablegen_on(trt_ops)
mlir_add_rewriter(pd_lower_to_trt)
......
......@@ -21,6 +21,26 @@
#include "paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h"
#include "paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h"
#include "paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h"
#include "paddle/infrt/dialect/tensorrt/trt_type_convert_pass.h"
#include "paddle/infrt/host_context/core_runtime.h"
#include "paddle/infrt/host_context/kernel_registry.h"
#include "paddle/infrt/host_context/mlir_to_runtime_translate.h"
#include "paddle/infrt/kernel/basic_kernels.h"
#include "paddle/infrt/kernel/control_flow_kernels.h"
#include "paddle/infrt/kernel/tensor_kernels.h"
#include "paddle/infrt/kernel/tensor_shape_kernels.h"
#include "paddle/infrt/kernel/test_kernels.h"
#include "paddle/infrt/kernel/tensorrt/registry.h"
#ifdef INFRT_WITH_PHI
#include "paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.h"
#include "paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h"
#include "paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launchers.h"
#include "paddle/infrt/kernel/phi/registry.h"
#endif
int main(int argc, char** argv) {
static llvm::cl::opt<std::string> input_file(
......@@ -33,6 +53,22 @@ int main(int argc, char** argv) {
mlir::MLIRContext* context = infrt::Global::getMLIRContext();
auto module = infrt::dialect::LoadMlirFile(input_file.c_str(), context);
infrt::host_context::KernelRegistry registry;
::infrt::kernel::RegisterBasicKernels(&registry);
::infrt::kernel::RegisterTestKernels(&registry);
::infrt::kernel::RegisterTensorShapeKernels(&registry);
::infrt::kernel::RegisterTensorKernels(&registry);
::infrt::kernel::RegisterControlFlowKernels(&registry);
#ifdef INFRT_WITH_PHI
::infrt::kernel::RegisterPhiKernels(&registry);
::infrt::kernel::RegisterInferShapeLaunchers(&registry);
#endif
#if defined(INFRT_WITH_GPU) && defined(INFRT_WITH_TRT)
::infrt::kernel::RegisterTrtKernels(&registry);
#endif
context->loadAllAvailableDialects();
module->dump();
mlir::PassManager pm(context);
......@@ -41,10 +77,12 @@ int main(int argc, char** argv) {
trt_pass_manager.addPass(std::make_unique<infrt::trt::TRTGraphFusePass>());
trt_pass_manager.addPass(std::make_unique<infrt::trt::TRTGraphSplitPass>(1));
trt_pass_manager.addPass(std::make_unique<infrt::trt::TRTOpConverterPass>());
trt_pass_manager.addPass(infrt::trt::createTrtTypeConvertPass());
if (mlir::failed(pm.run(*module))) {
std::cout << "\npass failed!\n" << std::endl;
return 4;
}
module->dump();
::infrt::host_context::TestMlir(module.get(), &registry);
return 0;
}
......@@ -12,10 +12,17 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h"
#include <glog/logging.h>
#include <mlir/IR/Builders.h>
#include <mlir/Transforms/DialectConversion.h>
#include "paddle/infrt/dialect/dense_tensor.h"
#include "paddle/infrt/dialect/pd/ir/pd_ops.h"
#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h"
#include "paddle/infrt/dialect/phi/ir/phi_base.h"
#include "paddle/infrt/dialect/tensorrt/trt_dialect_types.h"
#include "paddle/infrt/dialect/tensorrt/trt_ops.h"
namespace infrt {
namespace trt {
......@@ -41,34 +48,34 @@ struct PD2TRT_GraphLower : public ::mlir::RewritePattern {
::llvm::SmallVector<mlir::Type, 4>(1, EngineType::get()),
trt_inputs,
true /*run_once*/);
::mlir::Block *block = new ::mlir::Block;
block->getOperations().splice(block->begin(),
auto &block = create_engine_op.body().emplaceBlock();
block.getOperations().splice(block.begin(),
casted_op.getBody()->getOperations(),
casted_op.getBody()->begin(),
casted_op.getBody()->end());
create_engine_op.body().push_back(block);
// trt.execute
// outputs
::llvm::SmallVector<::mlir::Type, 4> execute_outputs_types;
for (auto v : casted_op.getODSResults(0)) {
execute_outputs_types.push_back(v.getType());
}
// inputs
::mlir::SmallVector<::mlir::Value, 4> execute_inputs(
create_engine_op.getODSResults(0));
for (auto v : inputs) {
execute_inputs.push_back(v);
}
auto execute_op = rewriter.create<ExecuteOp>(
ods_loc, execute_outputs_types, execute_inputs);
::llvm::SmallVector<::mlir::Value, 4> replace_values;
for (auto v :
::llvm::SmallVector<::mlir::Value, 4>{execute_op.getODSResults(0)}) {
replace_values.push_back(v);
// trt.compute
::llvm::SmallVector<::mlir::Value, 4> replace_values2;
auto ctx_op = rewriter.create<::infrt::phi::CreateGPUContextOp>(
ods_loc,
infrt::phi::ContextType::get(rewriter.getContext(),
infrt::TargetType::GPU));
auto compute_op = rewriter.create<EngineComputeOp>(
ods_loc,
::infrt::DenseTensorListType::get(rewriter.getContext()),
create_engine_op.engine(),
ctx_op.output());
auto tensor_list_val = compute_op.outputs();
for (size_t i = 0; i < casted_op.getNumResults(); ++i) {
auto res = casted_op->getResult(i);
auto int_attr = mlir::IntegerAttr::get(
mlir::IntegerType::get(rewriter.getContext(), 32), i);
auto get_tensor_op = rewriter.create<::infrt::dt::TensorListGetTensorOp>(
ods_loc, res.getType(), tensor_list_val, int_attr);
replace_values2.push_back(get_tensor_op.output());
}
rewriter.replaceOp(op, replace_values);
ctx_op->moveBefore(ctx_op->getBlock(), ctx_op->getBlock()->begin());
rewriter.replaceOp(op, replace_values2);
return ::mlir::success();
}
};
......@@ -82,6 +89,9 @@ void TRTOpConverterPass::runOnOperation() {
// this lowering. In our case, we are lowering to TensorRTDialect from
// PaddleDialect
target.addLegalDialect<TensorRTDialect>();
target.addLegalDialect<::infrt::phi::PHIDialect>();
target.addLegalDialect<::infrt::dt::DTDialect>();
target.addLegalDialect<phi::PHIDenseTensorDialect>();
// Now that the conversion target has been defined, we just need to provide
// the set of patterns that will lower the TensorRT operations.
......
......@@ -14,7 +14,9 @@
#include "paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h"
#include <llvm/Support/Casting.h>
#include <mlir/IR/Builders.h>
#include "paddle/infrt/dialect/dense_tensor.h"
#include "paddle/infrt/dialect/infrt/ir/basic_kernels.h"
#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h"
#include "paddle/infrt/dialect/pd/ir/pd_ops.h"
......@@ -35,10 +37,12 @@ void TRTOpTellerPass::runOnFunction() {
auto *op = worklist.back();
worklist.pop_back();
if (op == nullptr) continue;
if (op->getName().getStringRef().substr(0, 3) != "pd.") continue;
if (::llvm::dyn_cast_or_null<infrt::pd::FeedOp>(op)) continue;
if (::llvm::dyn_cast_or_null<infrt::pd::FetchOp>(op)) continue;
if (::llvm::dyn_cast_or_null<infrt::pd::GraphOp>(op)) continue;
if (::llvm::dyn_cast_or_null<::infrt::ReturnOp>(op)) continue;
builder.setInsertionPoint(op);
auto loc = getFunction().getLoc();
auto graph_op = builder.create<infrt::pd::GraphOp>(
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/infrt/dialect/tensorrt/trt_type_convert_pass.h"
#include <glog/logging.h>
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Casting.h"
#include "mlir/IR/Block.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/Dialect.h"
#include "mlir/IR/Operation.h"
#include "mlir/IR/OperationSupport.h"
#include "mlir/IR/Value.h"
#include "mlir/Pass/Pass.h"
#include "paddle/infrt/dialect/infrt/common/types.h"
#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h"
#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h"
#include "paddle/infrt/dialect/tensorrt/trt_ops.h"
namespace {
class TrtTypeConvertPass
: public mlir::PassWrapper<TrtTypeConvertPass, mlir::FunctionPass> {
public:
::llvm::StringRef getName() const override { return "TrtTypeConvertPass"; }
void runOnFunction() override;
};
void TrtTypeConvertPass::runOnFunction() {
mlir::Block& body = getFunction().front();
auto* mlir_ctx = getFunction()->getContext();
mlir::OpBuilder builder(&body, body.begin());
std::vector<mlir::Operation*> worklist;
mlir::Operation* ctx_op{nullptr};
worklist.reserve(body.getOperations().size());
for (auto& op : body) {
worklist.push_back(&op);
if (op.getName().getStringRef() == "phi_dt.create_context.gpu") {
ctx_op = &op;
}
}
::infrt::LayoutType layout = ::infrt::LayoutType::NCHW;
::infrt::TargetType target = ::infrt::TargetType::GPU;
for (auto& op : worklist) {
if (auto tensor_map_get_op =
llvm::dyn_cast<::infrt::phi::TensorMapGetTensorOp>(op)) {
auto res = tensor_map_get_op.output();
if (auto t = res.getType().dyn_cast<::infrt::DenseTensorType>()) {
auto replace_type = ::infrt::DenseTensorType::get(
mlir_ctx, t.getTarget(), t.getPrecision(), layout);
res.setType(replace_type);
}
}
if (auto create_engine = llvm::dyn_cast<::infrt::trt::CreateEngineOp>(op)) {
// Insert `infrt.gpu.memcpy` op.
for (auto arg : create_engine.getOperands()) {
if (mlir::Operation* producer = arg.getDefiningOp()) {
if (arg.getType().isa<::infrt::DenseTensorType>()) {
builder.setInsertionPointAfter(producer);
auto t = arg.getType().dyn_cast<::infrt::DenseTensorType>();
if (producer->getName().getStringRef() !=
"phi_dt.tensor_map_get_tensor" &&
t.getTarget() != ::infrt::TargetType::GPU) {
auto replace_type = ::infrt::DenseTensorType::get(
mlir_ctx, target, t.getPrecision(), layout);
CHECK_NOTNULL(ctx_op);
auto mem_cpy_op = builder.create<::infrt::phi::GpuMemCopyOp>(
arg.getLoc(),
replace_type,
arg,
llvm::dyn_cast<::infrt::phi::CreateGPUContextOp>(ctx_op)
.output(),
mlir::BoolAttr::get(mlir_ctx, /*d2h*/ false));
arg.replaceAllUsesExcept(mem_cpy_op.output(), mem_cpy_op);
}
}
} else {
auto blockArg = arg.cast<mlir::BlockArgument>();
if (arg.getType().isa<::infrt::DenseTensorType>()) {
auto t = arg.getType().dyn_cast<::infrt::DenseTensorType>();
builder.setInsertionPointAfter(ctx_op);
auto replace_type = ::infrt::DenseTensorType::get(
mlir_ctx, ::infrt::TargetType::GPU, t.getPrecision(), layout);
CHECK_NOTNULL(ctx_op);
auto mem_cpy_op = builder.create<::infrt::phi::GpuMemCopyOp>(
blockArg.getLoc(),
replace_type,
blockArg,
llvm::dyn_cast<::infrt::phi::CreateGPUContextOp>(ctx_op)
.output(),
mlir::BoolAttr::get(mlir_ctx, /*d2h*/ false));
arg.replaceAllUsesExcept(mem_cpy_op.output(), mem_cpy_op);
}
}
}
// Change ops(in block) types.
auto& block = create_engine.getRegion().getBlocks().front();
for (auto& op : block.without_terminator()) {
for (size_t i = 0; i < op.getNumResults(); ++i) {
if (auto t = op.getResult(i)
.getType()
.dyn_cast<::infrt::DenseTensorType>()) {
auto replace_type = ::infrt::DenseTensorType::get(
mlir_ctx, ::infrt::TargetType::GPU, t.getPrecision(), layout);
op.getResult(i).setType(replace_type);
}
}
}
} else if (auto list_get_tensor_op =
llvm::dyn_cast<::infrt::dt::TensorListGetTensorOp>(op)) {
auto result = list_get_tensor_op.output();
if (auto t = result.getType().dyn_cast<::infrt::DenseTensorType>()) {
result.setType(::infrt::DenseTensorType::get(
mlir_ctx, ::infrt::TargetType::GPU, t.getPrecision(), layout));
}
} else if (auto return_op = llvm::dyn_cast<::infrt::ReturnOp>(op)) {
for (auto arg : return_op->getOperands()) {
if (auto t = arg.getType().dyn_cast<::infrt::DenseTensorType>()) {
if (t.getLayout() != ::infrt::LayoutType::ANY ||
t.getTarget() != ::infrt::TargetType::CPU ||
t.getPrecision() != ::infrt::PrecisionType::FLOAT32) {
builder.setInsertionPoint(return_op);
CHECK_NOTNULL(ctx_op);
auto mem_cpy_op = builder.create<::infrt::phi::GpuMemCopyOp>(
return_op.getLoc(),
::infrt::DenseTensorType::get(mlir_ctx,
::infrt::TargetType::CPU,
t.getPrecision(),
::infrt::LayoutType::ANY),
arg,
llvm::dyn_cast<::infrt::phi::CreateGPUContextOp>(ctx_op)
.output(),
mlir::BoolAttr::get(mlir_ctx, /*d2h*/ true));
arg.replaceAllUsesExcept(mem_cpy_op.output(), mem_cpy_op);
}
}
}
}
}
}
} // namespace
namespace infrt {
namespace trt {
std::unique_ptr<mlir::Pass> createTrtTypeConvertPass() {
return std::make_unique<TrtTypeConvertPass>();
}
} // namespace trt
} // namespace infrt
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <mlir/Pass/Pass.h>
namespace infrt {
namespace trt {
std::unique_ptr<mlir::Pass> createTrtTypeConvertPass();
} // namespace trt
} // namespace infrt
......@@ -130,7 +130,7 @@ boost::optional<int32_t> MlirToRuntimeTranslator::EmitAttribute(
if (attr.isa<mlir::IntegerAttr>()) {
auto val = attr.cast<mlir::IntegerAttr>();
if (val.getType().isInteger(32)) {
return val.getInt();
return val.getValue().getSExtValue();
}
}
return boost::none;
......@@ -142,7 +142,7 @@ boost::optional<int64_t> MlirToRuntimeTranslator::EmitAttribute(
if (attr.isa<mlir::IntegerAttr>()) {
auto val = attr.cast<mlir::IntegerAttr>();
if (val.getType().isInteger(64)) {
return val.getInt();
return val.getValue().getSExtValue();
}
}
return boost::none;
......@@ -233,7 +233,7 @@ boost::optional<std::string> MlirToRuntimeTranslator::EmitAttribute(
\
std::vector<type__> res; \
for (auto& v : array) { \
res.push_back(v.cast<mlir::IntegerAttr>().getInt()); \
res.push_back(v.cast<mlir::IntegerAttr>().getValue().getSExtValue()); \
} \
return res; \
}
......@@ -309,7 +309,7 @@ bool MlirToRuntimeTranslator::EmitGeneralOp(
arg_value = GetOpResult(upstream_op);
}
}
if (arg_value->is_type<phi::DenseTensor>()) {
if (arg_value->is_type<::phi::DenseTensor>()) {
impl_->runtime->FeedInArgs(
std::make_pair(std::to_string(i), ValueRef(arg_value)));
}
......
......@@ -147,6 +147,7 @@ class Value : public common::Object {
#endif
explicit Value(::phi::DenseTensor&& x) : data(std::move(x)) {}
explicit Value(::phi::MetaTensor&& x) : data(std::move(x)) {}
explicit Value(::phi::MetaConfig&& x) : data(std::move(x)) {}
#ifdef INFRT_WITH_TRT
explicit Value(::infrt::backends::tensorrt::TrtEngine&& x)
: data(std::move(x)) {}
......
......@@ -30,6 +30,7 @@ namespace phi {
::phi::GPUContext context;
context.PartialInitWithoutAllocator();
context.SetAllocator(new ::infrt::backends::GpuPhiAllocator{});
context.SetHostAllocator(new backends::CpuPhiAllocator{});
context.PartialInitWithAllocator();
return context;
}
......
......@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/infrt/kernel/phi/dense_tensor_kernels.h"
#include "llvm/Support/ErrorHandling.h"
#include "paddle/infrt/common/string.h"
#include "paddle/infrt/dialect/phi/data_type.h"
#include "paddle/infrt/kernel/phi/context_kernels.h"
......@@ -228,6 +229,69 @@ int32_t TensorMapGetSize(const ::infrt::phi::DenseTensorMap& map) {
return map.size();
}
#ifdef INFRT_WITH_GPU
inline size_t SizeOfDataType(::phi::DataType data_type) {
switch (data_type) {
case ::phi::DataType::BOOL:
case ::phi::DataType::UINT8:
case ::phi::DataType::INT8:
return 1;
case ::phi::DataType::BFLOAT16:
case ::phi::DataType::FLOAT16:
case ::phi::DataType::INT16:
case ::phi::DataType::UINT16:
return 2;
case ::phi::DataType::FLOAT32:
case ::phi::DataType::INT32:
case ::phi::DataType::UINT32:
return 4;
case ::phi::DataType::FLOAT64:
case ::phi::DataType::INT64:
case ::phi::DataType::UINT64:
case ::phi::DataType::COMPLEX64:
return 8;
case ::phi::DataType::COMPLEX128:
return 16;
case ::phi::DataType::UNDEFINED:
return 0;
default:
llvm_unreachable("should not reach here");
return 0;
}
return 0;
}
::phi::DenseTensor GpuMemCpy(const ::phi::DenseTensor& input,
const ::phi::GPUContext& context,
bool d2h) {
if (d2h) {
::phi::DenseTensor ret(
const_cast<::phi::Allocator*>(&context.GetHostAllocator()),
input.meta());
CHECK(input.place().GetType() == ::phi::AllocationType::GPU);
// TODO(wilber): Add sync op and stream.
cudaMemcpyAsync(ret.data(),
input.data(),
SizeOfDataType(input.dtype()) * input.numel(),
cudaMemcpyDeviceToHost,
nullptr);
return ret;
} else {
// h2d
::phi::DenseTensor ret(
const_cast<::phi::Allocator*>(&context.GetAllocator()), input.meta());
CHECK(input.place().GetType() == ::phi::AllocationType::CPU ||
input.place().GetType() == ::phi::AllocationType::GPUPINNED);
// TODO(wilber): Add sync op and stream.
cudaMemcpyAsync(ret.data(),
input.data(),
SizeOfDataType(input.dtype()) * input.numel(),
cudaMemcpyHostToDevice,
nullptr);
return ret;
}
}
#endif
} // namespace phi
} // namespace kernel
} // namespace infrt
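A minimal sketch (plain C++, no CUDA or Paddle dependencies; the enum and values are stand-ins, not the real ::phi::DataType) of how GpuMemCpy above sizes and directs its copy: bytes = element size times element count, and the d2h attribute picks the copy direction.
#include <cstddef>
#include <cstdio>

// Hypothetical stand-in covering only two sample cases.
enum class DataType { FLOAT32, INT64 };

static size_t SizeOfDataType(DataType t) {
  // Mirrors the switch in the kernel above for these two cases.
  return t == DataType::FLOAT32 ? 4 : 8;
}

int main() {
  bool d2h = false;  // false: host-to-device, true: device-to-host
  size_t bytes = SizeOfDataType(DataType::FLOAT32) * (3 * 8 * 8);
  std::printf("cudaMemcpyAsync would copy %zu bytes %s\n", bytes,
              d2h ? "device-to-host" : "host-to-device");
  return 0;
}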
......@@ -18,6 +18,7 @@
#include "paddle/infrt/dialect/infrt/common/types.h"
#include "paddle/infrt/host_context/kernel_utils.h"
#include "paddle/infrt/tensor/phi/tensor_map.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/dense_tensor.h"
namespace infrt {
......@@ -55,6 +56,12 @@ infrt::phi::DenseTensorMap LoadParams(
int32_t TensorMapGetSize(const ::infrt::phi::DenseTensorMap& map);
#ifdef INFRT_WITH_GPU
::phi::DenseTensor GpuMemCpy(const ::phi::DenseTensor& input,
const ::phi::GPUContext& context,
bool d2h);
#endif
} // namespace phi
} // namespace kernel
} // namespace infrt
......@@ -14,6 +14,7 @@
#include "paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launcher.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/meta_tensor.h"
namespace infrt {
namespace kernel {
......@@ -31,6 +32,10 @@ void InferShapedKernelLauncher::CreateKernelFrameForInferShape(
infershape_kernel_frame_builder.AddArgument(value);
}
}
if (infershape_kernel_frame_builder.GetNumArgs() < arg_size_) {
infershape_kernel_frame_builder.AddArgument(
new host_context::Value(::phi::MetaConfig()));
}
}
void InferShapedKernelLauncher::BuildInferShapeCache(
......
......@@ -22,11 +22,8 @@ namespace infrt {
namespace kernel {
struct InferShapedKernelLauncher {
virtual void Invoke(host_context::KernelFrame* frame) = 0;
virtual ~InferShapedKernelLauncher() = default;
protected:
explicit InferShapedKernelLauncher(int arg_size) : arg_size_(arg_size) {}
~InferShapedKernelLauncher() = default;
//! Initialize the kernel frame for InferShape kernel.
// This method will create a new KernelFrame with all the Tensors(currently
// only DenseHostTensor) converted into MetaTensors so that the infer-shape
......@@ -46,6 +43,7 @@ struct InferShapedKernelLauncher {
llvm::SmallVector<host_context::ValueRef, 3> values;
llvm::SmallVector<::phi::DDim, 3> tensor_shape_cache;
host_context::KernelFrameBuilder infershape_kernel_frame_builder;
const int arg_size_;
};
} // namespace kernel
......
......@@ -24,46 +24,44 @@
namespace infrt {
namespace kernel {
template <typename F>
struct FuncArgStatics {};
template <typename Return, typename... Args>
struct FuncArgStatics<Return (*)(Args...)> {
constexpr static int arg_size = sizeof...(Args);
};
template <typename KernelFunc,
KernelFunc kernel,
typename InferShapedFunc,
InferShapedFunc infershape>
class KernelLauncher : public InferShapedKernelLauncher {
public:
void KernelLauncherFunc(host_context::KernelFrame* frame) {
static InferShapedKernelLauncher launcher(
FuncArgStatics<InferShapedFunc>::arg_size);
static const uint16_t num_input_tensors{InferShapeHelper<KernelFunc>::count};
static const bool turn_on_infer_shape_cache{true};
void Invoke(host_context::KernelFrame* frame) override {
#ifndef NDEBUG
LOG(INFO) << "Kernel.frame: " << frame->DumpArgTypes();
#endif
// Build the infershape KernelFrame if needed.
// TODO(Superjomn) add unlikely here.
if (infershape_kernel_frame_builder.IsEmpty()) {
CreateKernelFrameForInferShape(frame);
if (launcher.infershape_kernel_frame_builder.IsEmpty()) {
launcher.CreateKernelFrameForInferShape(frame);
#ifndef NDEBUG
LOG(INFO) << "infershape.frame: "
<< infershape_kernel_frame_builder.DumpArgTypes();
<< launcher.infershape_kernel_frame_builder.DumpArgTypes();
#endif
}
if (turn_on_infer_shape_cache) {
if (!turn_on_infer_shape_cache || IsShapeChanged(num_input_tensors)) {
if (launcher.IsShapeChanged(num_input_tensors)) {
::infrt::host_context::KernelImpl<InferShapedFunc, infershape>::Invoke(
&infershape_kernel_frame_builder);
BuildInferShapeCache(num_input_tensors);
&launcher.infershape_kernel_frame_builder);
launcher.BuildInferShapeCache(num_input_tensors);
}
}
::infrt::host_context::KernelImpl<KernelFunc, kernel>::Invoke(frame);
}
};
template <typename KernelFunc,
KernelFunc kernel,
typename InferShapedFunc,
InferShapedFunc infershape>
void KernelLauncherFunc(
KernelLauncher<KernelFunc, kernel, InferShapedFunc, infershape> launcher,
host_context::KernelFrame* frame) {
launcher.Invoke(frame);
}
} // namespace kernel
......
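A small standalone sketch of the sizeof...(Args) trick that FuncArgStatics above uses to count an infer-shape function's parameters at compile time; DummyInferShape is a made-up stand-in, not a Paddle function.
#include <cstdio>

template <typename F>
struct FuncArgStatics {};

template <typename Return, typename... Args>
struct FuncArgStatics<Return (*)(Args...)> {
  constexpr static int arg_size = sizeof...(Args);
};

// Hypothetical infershape-style function with three parameters.
void DummyInferShape(int, float, double) {}

int main() {
  // Resolved at compile time to the parameter count of DummyInferShape.
  constexpr int n = FuncArgStatics<decltype(&DummyInferShape)>::arg_size;
  static_assert(n == 3, "three arguments expected");
  std::printf("arg_size = %d\n", n);
  return 0;
}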
......@@ -52,6 +52,9 @@ void RegisterPhiKernels(host_context::KernelRegistry* registry) {
"phi_dt.create_dense_tensor.gpu",
INFRT_KERNEL(infrt::kernel::phi::CreateGPUDenseTensor),
{"dims", "lod", "layout", "precision"});
registry->AddKernelWithAttrs("phi_dt.memcpy.gpu",
INFRT_KERNEL(infrt::kernel::phi::GpuMemCpy),
{"d2h"});
#endif
registry->AddKernelWithAttrs("phi_dt.load_params",
INFRT_KERNEL(infrt::kernel::phi::LoadParams),
......
......@@ -14,6 +14,7 @@
#include "paddle/infrt/kernel/tensorrt/trt_kernels.h"
#include <string>
#include <unordered_set>
#include "NvInfer.h"
#include "NvInferRuntime.h"
#include "NvInferRuntimeCommon.h"
......@@ -68,7 +69,7 @@ namespace tensorrt {
auto& region = operation.getRegion(0);
auto& block = region.getBlocks().front();
std::unordered_map<std::string, phi::DenseTensor*> trt_bind_inputs;
std::unordered_map<std::string, ::phi::DenseTensor*> trt_bind_inputs;
ValueToITensorMap value_to_trt_tensor_map;
ValueToTensorMap value_to_tensor_map;
......@@ -79,7 +80,7 @@ namespace tensorrt {
const std::string input_name = "input_" + std::to_string(idx);
auto* v = symbol_table->GetValue(std::to_string(idx));
CHECK_NOTNULL(v);
auto* t = &v->get<phi::DenseTensor>();
auto* t = &v->get<::phi::DenseTensor>();
value_to_tensor_map[operand] = t;
// TODO(wilber): get input info from mlir.
......@@ -93,7 +94,7 @@ namespace tensorrt {
if (operand.isa<mlir::BlockArgument>()) {
// TODO(wilber): A trick: the weights are CPU tensor and inputs are GPU
// tensor, so we treat all GPU tensors as inputs to trt.
if (t->place().GetType() == phi::AllocationType::GPU) {
if (t->place().GetType() == ::phi::AllocationType::GPU) {
trt_bind_inputs[input_name] = t;
nvinfer1::Dims dims;
dims.nbDims = t->dims().size() - 1;
......@@ -106,8 +107,10 @@ namespace tensorrt {
}
} else {
// TODO(wilber): Replace with the op name that generates the weights.
if (operand.getDefiningOp()->getName().getStringRef() !=
"phi_dt.create_dense_tensor.cpu") {
std::unordered_set<std::string> weight_flags{
"phi_dt.tensor_map_get_tensor", "phi_dt.create_dense_tensor.cpu"};
if (!weight_flags.count(
operand.getDefiningOp()->getName().getStringRef().str())) {
trt_bind_inputs[input_name] = t;
nvinfer1::Dims dims;
dims.nbDims = t->dims().size() - 1;
......@@ -167,10 +170,10 @@ void PrintTrtLayer(backends::tensorrt::TrtEngine* engine) {
engine->GetEngineInfo();
}
std::vector<phi::DenseTensor*> TrtEngineCompute(
backends::tensorrt::TrtEngine* engine, const phi::GPUContext& context) {
std::vector<::phi::DenseTensor*> TrtEngineCompute(
backends::tensorrt::TrtEngine* engine, const ::phi::GPUContext& context) {
engine->Run(context);
std::vector<phi::DenseTensor*> res;
std::vector<::phi::DenseTensor*> res;
for (size_t i = 0; i < engine->GetOutputNum(); ++i) {
res.push_back(engine->GetOutput("output_" + std::to_string(i)));
}
......
......@@ -41,8 +41,8 @@ struct MlirOperationWithInfrtSymbol {
void PrintTrtLayer(backends::tensorrt::TrtEngine* engine);
std::vector<phi::DenseTensor*> TrtEngineCompute(
backends::tensorrt::TrtEngine* engine, const phi::GPUContext& context);
std::vector<::phi::DenseTensor*> TrtEngineCompute(
backends::tensorrt::TrtEngine* engine, const ::phi::GPUContext& context);
} // namespace tensorrt
} // namespace kernel
......
......@@ -7,3 +7,4 @@ add_test(NAME test_infrt_by_lit COMMAND sh -c "lit -v ${CMAKE_SOURCE_DIR}/paddle
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/dialect/tensor/tensor_map.mlir.in ${CMAKE_CURRENT_SOURCE_DIR}/dialect/tensor/tensor_map.mlir)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/dialect/phi/linear_cpu.mlir.in ${CMAKE_CURRENT_SOURCE_DIR}/dialect/phi/linear_cpu.mlir)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/dialect/tensorrt/disabled_linear.mlir.in ${CMAKE_CURRENT_SOURCE_DIR}/dialect/tensorrt/disabled_linear.mlir)
// RUN: infrtexec -i %s
module {
func @predict(%arg0: !infrt.dense_tensor<CPU, FP32, NCHW>) -> !infrt.dense_tensor<CPU, FP32, NCHW> {
func @predict(%arg0: !infrt.dense_tensor<CPU, FP32, NCHW>,%filter: !infrt.dense_tensor<CPU, FP32, NCHW>, %arg1: !infrt.dense_tensor<CPU, FP32, NCHW>, %arg2: !infrt.dense_tensor<CPU, FP32, NCHW>, %arg3: !infrt.dense_tensor<CPU, FP32, NCHW>, %arg4: !infrt.dense_tensor<CPU, FP32, NCHW>) -> !infrt.dense_tensor<CPU, FP32, NCHW> {
%2 = "pd.abs"(%arg0) : (!infrt.dense_tensor<CPU, FP32, NCHW>) -> !infrt.dense_tensor<CPU, FP32, NCHW>
infrt.return %2 : !infrt.dense_tensor<CPU, FP32, NCHW>
%3 = "pd.matmul_v2"(%arg0, %2) {trans_x = false, trans_y = false} : (!infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>) -> !infrt.dense_tensor<CPU, FP32, NCHW>
%4 = "pd.conv2d"(%3, %filter) {data_format = "NCHW", dilations = [1 : i32, 1 : i32], groups = 1 : si32, padding_algorithm = "EXPLICIT", paddings = [1 : i32, 1 : i32], strides = [2 : i32, 2 : i32]} : (!infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>) -> !infrt.dense_tensor<CPU, FP32, NCHW>
%Y, %MeanOut, %VarianceOut = "pd.batch_norm"(%4, %arg1, %arg2, %arg3, %arg4) {data_layout = "NCHW", epsilon = 9.99999974E-6 : f32, momentum = 0.899999976 : f32} : (!infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>) -> (!infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>)
%out = "pd.relu"(%Y) : (!infrt.dense_tensor<CPU, FP32, NCHW>) -> !infrt.dense_tensor<CPU, FP32, NCHW>
%5 = "pd.elementwise_add"(%out, %out) {axis = -1:si32} : (!infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>) -> !infrt.dense_tensor<CPU, FP32, NCHW>
infrt.return %5 : !infrt.dense_tensor<CPU, FP32, NCHW>
}
func @main() {
%ctx = "phi_dt.create_context.cpu" (): () -> !phi.context<CPU>
%t = "phi_dt.create_dense_tensor.cpu" (%ctx) {precision=#infrt.precision<FP32>, layout=#infrt.layout<NCHW>, lod=[1:i64], dims=[1:i64]}: (!phi.context<CPU>) -> (!infrt.dense_tensor<CPU, FP32, NCHW>)
%t = "phi_dt.create_dense_tensor.cpu" (%ctx) {precision=#infrt.precision<FP32>, layout=#infrt.layout<NCHW>, lod=[1], dims=[1, 3, 8, 8]}: (!phi.context<CPU>) -> (!infrt.dense_tensor<CPU, FP32, NCHW>)
"phi_dt.fill_dense_tensor.f32"(%t) {value=[3.8:f32]} : (!infrt.dense_tensor<CPU, FP32, NCHW>) -> ()
%2 = infrt.call@predict(%t) : (!infrt.dense_tensor<CPU, FP32, NCHW>) -> !infrt.dense_tensor<CPU, FP32, NCHW>
%filter = "phi_dt.create_dense_tensor.cpu" (%ctx) {precision=#infrt.precision<FP32>, layout=#infrt.layout<NCHW>, lod=[1], dims=[3, 3, 8, 8]}: (!phi.context<CPU>) -> (!infrt.dense_tensor<CPU, FP32, NCHW>)
"phi_dt.fill_dense_tensor.f32"(%filter) {value=[3.8:f32]} : (!infrt.dense_tensor<CPU, FP32, NCHW>) -> ()
%bias = "phi_dt.create_dense_tensor.cpu" (%ctx) {precision=#infrt.precision<FP32>, layout=#infrt.layout<NCHW>, lod=[1], dims=[3]}: (!phi.context<CPU>) -> (!infrt.dense_tensor<CPU, FP32, NCHW>)
"phi_dt.fill_dense_tensor.f32"(%bias) {value=[1.5:f32]} : (!infrt.dense_tensor<CPU, FP32, NCHW>) -> ()
%mean = "phi_dt.create_dense_tensor.cpu" (%ctx) {precision=#infrt.precision<FP32>, layout=#infrt.layout<NCHW>, lod=[1], dims=[3]}: (!phi.context<CPU>) -> (!infrt.dense_tensor<CPU, FP32, NCHW>)
"phi_dt.fill_dense_tensor.f32"(%mean) {value=[3.5:f32]} : (!infrt.dense_tensor<CPU, FP32, NCHW>) -> ()
%scale = "phi_dt.create_dense_tensor.cpu" (%ctx) {precision=#infrt.precision<FP32>, layout=#infrt.layout<NCHW>, lod=[1], dims=[3]}: (!phi.context<CPU>) -> (!infrt.dense_tensor<CPU, FP32, NCHW>)
"phi_dt.fill_dense_tensor.f32"(%scale) {value=[1.0:f32]} : (!infrt.dense_tensor<CPU, FP32, NCHW>) -> ()
%var = "phi_dt.create_dense_tensor.cpu" (%ctx) {precision=#infrt.precision<FP32>, layout=#infrt.layout<NCHW>, lod=[1], dims=[3]}: (!phi.context<CPU>) -> (!infrt.dense_tensor<CPU, FP32, NCHW>)
"phi_dt.fill_dense_tensor.f32"(%var) {value=[0.0:f32]} : (!infrt.dense_tensor<CPU, FP32, NCHW>) -> ()
%2 = infrt.call@predict(%t, %filter, %bias, %mean, %scale, %var) : (!infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>,!infrt.dense_tensor<CPU, FP32, NCHW>,!infrt.dense_tensor<CPU, FP32, NCHW>,!infrt.dense_tensor<CPU, FP32, NCHW>) -> !infrt.dense_tensor<CPU, FP32, NCHW>
//phi_dt.print_tensor(%t : !infrt.dense_tensor<CPU, FP32, NCHW>)
phi_dt.print_tensor(%2 : !infrt.dense_tensor<CPU, FP32, NCHW>)
infrt.return
}
......
module {
func @main_graph(%map: !phi.dense_tensor_map, %arg0: !infrt.dense_tensor<CPU, FP32, ANY>) -> !infrt.dense_tensor<CPU, FP32, ANY> {
%0 = "phi_dt.create_context.gpu"() : () -> !phi.context<GPU>
%1 = "phi_dt.memcpy.gpu"(%arg0, %0) {d2h = false} : (!infrt.dense_tensor<CPU, FP32, ANY>, !phi.context<GPU>) -> !infrt.dense_tensor<GPU, FP32, NCHW>
%3 = phi_dt.tensor_map_get_tensor(%map) {name = "linear_0.b_0"} -> !infrt.dense_tensor<CPU, FP32, NCHW>
%4 = phi_dt.tensor_map_get_tensor(%map) {name = "linear_0.w_0"} -> !infrt.dense_tensor<CPU, FP32, NCHW>
%5 = "trt.create_engine"(%1, %4, %3) ( {
%10 = "trt.FullyConnected"(%1, %4, %3) {out_channel_num = 10 : si32} : (!infrt.dense_tensor<GPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>) -> !infrt.dense_tensor<GPU, FP32, NCHW>
infrt.return %10 : !infrt.dense_tensor<GPU, FP32, NCHW>
}) {run_once = true} : (!infrt.dense_tensor<GPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>) -> !trt.engine
%6 = "trt.compute"(%5, %0) : (!trt.engine, !phi.context<GPU>) -> !infrt.tensor_list
%7 = "dt.tensor_list_get_tensor"(%6) {id = 0 : i32} : (!infrt.tensor_list) -> !infrt.dense_tensor<GPU, FP32, NCHW>
%8 = "phi_dt.memcpy.gpu"(%7, %0) {d2h = true} : (!infrt.dense_tensor<GPU, FP32, NCHW>, !phi.context<GPU>) -> !infrt.dense_tensor<CPU, FP32, ANY>
infrt.return %8 : !infrt.dense_tensor<CPU, FP32, ANY>
}
func @main() {
%map = phi_dt.load_combined_params(){model_path="@CMAKE_BINARY_DIR@/linear/linear.pdmodel",
params_path="@CMAKE_BINARY_DIR@/linear/linear.pdiparams"}
%ctx = "phi_dt.create_context.cpu" (): () -> !phi.context<CPU>
%input_tensor = "phi_dt.create_dense_tensor.cpu" (%ctx) {
precision=#infrt.precision<FP32>,
layout=#infrt.layout<NCHW>,
dims=[3:i64, 784:i64, 1:i64, 1:i64], lod=[1:i64]}: (!phi.context<CPU>) -> (!infrt.dense_tensor<CPU, FP32, NCHW>)
"phi_dt.fill_dense_tensor.f32"(%input_tensor) {value=[3.8:f32, 2.4:f32, 1.3:f32]} : (!infrt.dense_tensor<CPU, FP32, NCHW>) -> ()
%res = infrt.call @main_graph(%map, %input_tensor) {} : (!phi.dense_tensor_map, !infrt.dense_tensor<CPU, FP32, NCHW>) -> !infrt.dense_tensor<CPU, FP32, NCHW>
"phi_dt.print_tensor" (%res) : (!infrt.dense_tensor<CPU, FP32, NCHW>) -> ()
infrt.return
}
}
......@@ -518,6 +518,30 @@ class PADDLE_API Tensor final {
/* Part 10: Auto generated Tensor methods */
/* Part 11: Methods of converting SparseTensor and DenseTensor to each other
*/
/**
* @brief Convert DenseTensor or SparseCsrTensor to SparseCooTensor
*
* @param sparse_dim The number of sparse dimensions
* @return Tensor
*/
Tensor to_sparse_coo(const int64_t sparse_dim) const;
/**
* @brief Convert DenseTensor or SparseCooTensor to SparseCsrTensor
*
* @return Tensor
*/
Tensor to_sparse_csr() const;
/**
* @brief Convert SparseCooTensor or SparseCsrTensor to DenseTensor
*
* @return Tensor
*/
Tensor to_dense() const;
private:
/**
* [ Why use abstract TensorImpl interface here? ]
......
......@@ -149,4 +149,4 @@ cc_library(phi_bw_function_api SRCS ${bw_api_source_file} DEPS phi_tensor_raw ph
cc_library(sparse_api SRCS ${sparse_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils sparse_api_custom_impl)
cc_library(sparse_bw_api SRCS ${sparse_bw_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils sparse_api sparse_api_custom_impl)
cc_library(phi_tensor SRCS tensor_method.cc DEPS phi_tensor_raw phi_function_api api_gen_utils kernel_dispatch infermeta)
cc_library(phi_tensor SRCS tensor_method.cc DEPS phi_tensor_raw phi_function_api api_gen_utils kernel_dispatch infermeta sparse_api)
......@@ -19,6 +19,7 @@ limitations under the License. */
#include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/tensor_base.h"
#include "paddle/phi/api/include/sparse_api.h"
#include "paddle/phi/api/lib/api_gen_utils.h"
#include "paddle/phi/api/lib/kernel_dispatch.h"
#include "paddle/phi/infermeta/unary.h"
......@@ -183,5 +184,17 @@ void Tensor::copy_(const Tensor &src,
}
}
Tensor Tensor::to_sparse_coo(const int64_t sparse_dim) const {
return experimental::sparse::to_sparse_coo(*this, sparse_dim);
}
Tensor Tensor::to_sparse_csr() const {
return experimental::sparse::to_sparse_csr(*this);
}
Tensor Tensor::to_dense() const {
return experimental::sparse::to_dense(*this);
}
} // namespace experimental
} // namespace paddle
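A hedged usage sketch of the three conversion methods added to Tensor above; it assumes a dense paddle::experimental::Tensor is produced elsewhere and that the public header path below matches this tree.
#include "paddle/phi/api/include/tensor.h"

void SparseRoundTrip(const paddle::experimental::Tensor& x) {
  // Represent the first two dimensions sparsely, keep the rest dense.
  auto coo = x.to_sparse_coo(/*sparse_dim=*/2);
  // CSR form of the same values.
  auto csr = x.to_sparse_csr();
  // Back to a regular dense tensor.
  auto dense = coo.to_dense();
}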
......@@ -16,16 +16,18 @@
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/enforce.h"
#include <ThreadPool.h>
namespace phi {
CallbackManager::CallbackManager(stream::Stream *stream)
: stream_(stream), thread_pool_(1) {}
: stream_(stream), thread_pool_(new ::ThreadPool(1)) {}
void CallbackManager::AddCallback(std::function<void()> callback) const {
auto *callback_func = new std::function<void()>(std::move(callback));
auto *func = new std::function<void()>([this, callback_func] {
std::lock_guard<std::mutex> lock(mtx_);
last_future_ = thread_pool_.enqueue([callback_func] {
last_future_ = thread_pool_->enqueue([callback_func] {
std::unique_ptr<std::function<void()>> releaser(callback_func);
(*callback_func)();
});
......
......@@ -14,8 +14,6 @@
#pragma once
#include <ThreadPool.h>
#ifdef PADDLE_WITH_CUDA
#include <cuda.h>
#include <cuda_runtime.h>
......@@ -30,6 +28,8 @@
#include <memory>
#include <mutex> // NOLINT
class ThreadPool;
namespace phi {
namespace stream {
......@@ -50,7 +50,7 @@ class CallbackManager {
private:
stream::Stream* stream_;
mutable ::ThreadPool thread_pool_;
mutable std::shared_ptr<::ThreadPool> thread_pool_;
mutable std::mutex mtx_;
mutable std::future<void> last_future_;
};
......
......@@ -14,6 +14,8 @@
#pragma once
#ifdef PADDLE_WITH_CUSTOM_DEVICE
#include <vector>
#include "paddle/phi/backends/event.h"
#include "paddle/phi/backends/stream.h"
......
......@@ -124,6 +124,10 @@ class OpUtilsMap {
{std::move(op_type), std::move(base_kernel_name)});
}
bool HasArgumentMappingFn(const std::string& op_type) const {
return arg_mapping_fn_map_.count(op_type);
}
void InsertArgumentMappingFn(std::string op_type, ArgumentMappingFn fn) {
PADDLE_ENFORCE_EQ(
arg_mapping_fn_map_.count(op_type),
......
......@@ -832,6 +832,50 @@ void MultiDotInferMeta(const std::vector<MetaTensor*>& x, MetaTensor* out) {
out->share_lod(*x.at(0));
}
void MultiplexInferMeta(const std::vector<MetaTensor*>& ins,
const MetaTensor& ids,
MetaTensor* out) {
PADDLE_ENFORCE_NE(
ins.empty(),
true,
phi::errors::InvalidArgument("MultiInput(X) shouldn't be empty."));
auto ids_dim = ids.dims();
PADDLE_ENFORCE_EQ(ids_dim.size(),
2,
phi::errors::PreconditionNotMet(
"The index tensor must be a vector with 2 dimensions"));
PADDLE_ENFORCE_EQ(
ids_dim[1],
1,
phi::errors::PreconditionNotMet(
"The index tensor must be a vector with batchSize x 1."));
auto ins_dims = GetMetaTensorsDim(ins);
auto num_ins = ins_dims.size();
PADDLE_ENFORCE_GT(
num_ins,
1,
phi::errors::InvalidArgument("multiplex operator should have more than "
"one candidate input tensors."));
auto in_dim = ins_dims[0];
PADDLE_ENFORCE_GE(
in_dim.size(),
2,
phi::errors::InvalidArgument(
"The rank of candidate tensors must be not less than 2."));
for (size_t i = 1; i < num_ins; i++) {
auto dim = ins_dims[i];
PADDLE_ENFORCE_EQ(
in_dim,
dim,
phi::errors::PreconditionNotMet(
"All the candidate tensors must have the same size."));
}
out->set_dims(in_dim);
out->set_dtype(ins[0]->dtype());
}
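// Illustrative note for MultiplexInferMeta above (not from the original
// source): multiplex selects rows from the candidate inputs according to ids.
// With three candidates of shape [4, 8] and ids of shape [4, 1] holding
// {2, 0, 1, 0}, the output is [4, 8] and row i is taken from candidate
// ins[ids[i]], e.g. row 0 comes from ins[2] and row 1 from ins[0].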
void PsroiPoolInferMeta(const MetaTensor& x,
const MetaTensor& rois,
paddle::optional<const MetaTensor&> rois_num,
......
......@@ -152,6 +152,10 @@ void HierarchicalSigmoidInferMeta(const MetaTensor& x,
void MultiDotInferMeta(const std::vector<MetaTensor*>& x, MetaTensor* out);
void MultiplexInferMeta(const std::vector<MetaTensor*>& ins,
const MetaTensor& ids,
MetaTensor* out);
void PsroiPoolInferMeta(const MetaTensor& x,
const MetaTensor& rois,
paddle::optional<const MetaTensor&> rois_num,
......
......@@ -22,6 +22,7 @@ limitations under the License. */
#include "paddle/phi/common/type_traits.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/kernels/funcs/parse_qr_mode.h"
#include "paddle/phi/kernels/funcs/pooling.h"
#include "paddle/phi/kernels/funcs/unfold_functor.h"
#include "paddle/phi/kernels/funcs/unsqueeze.h"
......@@ -1129,6 +1130,44 @@ void RealAndImagInferMeta(const MetaTensor& x, MetaTensor* out) {
out->set_layout(x.layout());
}
void QrInferMeta(const MetaTensor& x,
const std::string& mode,
MetaTensor* q,
MetaTensor* r) {
auto x_dims = x.dims();
int x_rank = x_dims.size();
PADDLE_ENFORCE_GE(
x_dims.size(),
2,
phi::errors::InvalidArgument("the rank of input must greater than 2"));
bool compute_q;
bool reduced_mode;
int m = x_dims[x_rank - 2];
int n = x_dims[x_rank - 1];
int min_mn = std::min(m, n);
std::tie(compute_q, reduced_mode) = phi::funcs::ParseQrMode(mode);
if (compute_q) {
int k = reduced_mode ? min_mn : m;
auto q_dims_vec = phi::vectorize(x_dims);
q_dims_vec[q_dims_vec.size() - 1] = k;
q->set_dims(phi::make_ddim(q_dims_vec));
} else {
q->set_dims(phi::make_ddim({0}));
}
int k = reduced_mode ? min_mn : m;
auto r_dims_vec = phi::vectorize(x_dims);
r_dims_vec[r_dims_vec.size() - 2] = k;
r_dims_vec[r_dims_vec.size() - 1] = n;
r->set_dims(phi::make_ddim(r_dims_vec));
q->share_lod(x);
r->share_lod(x);
q->set_dtype(x.dtype());
r->set_dtype(x.dtype());
}
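// Shape example for QrInferMeta above (illustrative, not from the patch):
// for x of shape [5, 3] (m = 5, n = 3, min_mn = 3),
//   mode "reduced"  -> q: [5, 3], r: [3, 3]
//   mode "complete" -> q: [5, 5], r: [5, 3]
//   mode "r"        -> q: [0],    r: [3, 3]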
DDim ReduceInferDim(const MetaTensor& x,
const std::vector<int64_t>& axis,
bool keep_dim,
......@@ -1847,6 +1886,20 @@ void UnbindInferMeta(const MetaTensor& x,
}
}
void TrilTriuInferMeta(const MetaTensor& x,
int diagonal,
bool lower,
MetaTensor* out) {
const auto& x_dims = x.dims();
PADDLE_ENFORCE_GE(x_dims.size(),
2,
phi::errors::InvalidArgument(
"Input(X)'s rank must be at least 2 in TrilTriuOp."));
out->set_dims(x.dims());
out->share_lod(x);
out->set_dtype(x.dtype());
}
void UnchangedInferMeta(const MetaTensor& x, MetaTensor* out) {
out->share_meta(x);
}
......
......@@ -180,6 +180,11 @@ void PoolInferMeta(const MetaTensor& x,
MetaTensor* out,
MetaConfig config = MetaConfig());
void QrInferMeta(const MetaTensor& x,
const std::string& mode,
MetaTensor* q,
MetaTensor* r);
void RealAndImagInferMeta(const MetaTensor& x, MetaTensor* out);
void ReduceInferMeta(const MetaTensor& x,
......@@ -282,6 +287,11 @@ void TransposeGradInferMeta(const MetaTensor& x,
const std::vector<int>& axis,
MetaTensor* out);
void TrilTriuInferMeta(const MetaTensor& x,
int diagonal,
bool lower,
MetaTensor* out);
void UnbindInferMeta(const MetaTensor& x,
int axis,
std::vector<MetaTensor>* outs);
......
......@@ -62,6 +62,8 @@ register_kernels(EXCLUDES ${COMMON_BAISC_KERNELS} ${MANUAL_BUILD_KERNELS} DEPS $
# phi sparse kernels
add_subdirectory(sparse)
# phi selected_rows kernels
add_subdirectory(selected_rows)
copy_if_different(${kernel_declare_file} ${kernel_declare_file_final})
......
......@@ -45,3 +45,17 @@ PD_REGISTER_KERNEL(matmul_triple_grad,
double,
phi::dtype::complex<float>,
phi::dtype::complex<double>) {}
PD_REGISTER_KERNEL(matmul_with_flatten_grad,
CPU,
ALL_LAYOUT,
phi::MatmulWithFlattenGradKernel,
float,
double) {}
PD_REGISTER_KERNEL(matmul_with_flatten_double_grad,
CPU,
ALL_LAYOUT,
phi::MatmulWithFlattenDoubleGradKernel,
float,
double) {}
......@@ -28,3 +28,10 @@ PD_REGISTER_KERNEL(matmul,
double,
phi::dtype::complex<float>,
phi::dtype::complex<double>) {}
PD_REGISTER_KERNEL(matmul_with_flatten,
CPU,
ALL_LAYOUT,
phi::MatmulWithFlattenKernel,
float,
double) {}
......@@ -19,30 +19,10 @@
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/complex_functors.h"
#include "paddle/phi/kernels/funcs/parse_qr_mode.h"
namespace phi {
static inline std::tuple<bool, bool> ParseQrMode(const std::string& mode) {
bool compute_q;
bool reduced;
if (mode == "reduced") {
compute_q = true;
reduced = true;
} else if (mode == "complete") {
compute_q = true;
reduced = false;
} else if (mode == "r") {
compute_q = false;
reduced = true;
} else {
PADDLE_THROW(errors::InvalidArgument(
"QR received unrecognized mode '%s'"
" but expected one of 'reduced' (default), 'r', or 'complete'",
mode));
}
return std::make_tuple(compute_q, reduced);
}
template <typename T, typename Context>
void QrKernel(const Context& ctx,
const DenseTensor& x,
......@@ -51,7 +31,7 @@ void QrKernel(const Context& ctx,
DenseTensor* r) {
bool compute_q;
bool reduced_mode;
std::tie(compute_q, reduced_mode) = ParseQrMode(mode);
std::tie(compute_q, reduced_mode) = phi::funcs::ParseQrMode(mode);
auto numel = x.numel();
PADDLE_ENFORCE_GT(
numel, 0, errors::PreconditionNotMet("The input of QR is empty."));
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <tuple>
#include "paddle/phi/core/enforce.h"
namespace phi {
namespace funcs {
static inline std::tuple<bool, bool> ParseQrMode(const std::string& mode) {
bool compute_q;
bool reduced;
if (mode == "reduced") {
compute_q = true;
reduced = true;
} else if (mode == "complete") {
compute_q = true;
reduced = false;
} else if (mode == "r") {
compute_q = false;
reduced = true;
} else {
PADDLE_THROW(errors::InvalidArgument(
"QR received unrecognized mode '%s'"
" but expected one of 'reduced' (default), 'r', or 'complete'",
mode));
}
return std::make_tuple(compute_q, reduced);
}
} // namespace funcs
} // namespace phi
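A quick illustration of the three modes the helper above accepts; the resulting pair is {compute_q, reduced}. The include path is taken from the surrounding patch, and the snippet assumes the header compiles standalone in this tree.
#include <iostream>
#include <string>
#include <tuple>
#include "paddle/phi/kernels/funcs/parse_qr_mode.h"

int main() {
  for (const std::string& mode : {"reduced", "complete", "r"}) {
    bool compute_q = false, reduced = false;
    std::tie(compute_q, reduced) = phi::funcs::ParseQrMode(mode);
    // Prints: reduced -> 1 1, complete -> 1 0, r -> 0 1
    std::cout << mode << " -> " << compute_q << " " << reduced << std::endl;
  }
  return 0;
}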
......@@ -453,25 +453,20 @@ struct ReduceConfig {
void SetReduceType() {
int rank = x_dim.size();
int reduce_rank = reduce_dim.size();
bool is_last_dim =
(rank == 2) && (reduce_rank == 1) && (reduce_dim[0] == 1);
if (rank == reduce_rank || is_last_dim) {
#ifdef PADDLE_WITH_XPU_KP
reduce_type = static_cast<int>(ReduceType::kReduceAny);
bool not_higher = x_dim[0] > 1;
#else
reduce_type = static_cast<int>(ReduceType::kReduceLastDim);
int device_id = paddle::platform::GetCurrentDeviceId();
int max_grid_z = phi::backends::gpu::GetGpuMaxGridDimSize(device_id)[2];
bool not_higher = x_dim[0] >= max_grid_z;
#endif
if (reduce_last_dim && (reduce_rank == 1)) {
reduce_type = static_cast<int>(ReduceType::kReduceLastDim);
} else if (reduce_rank == 1) {
// ReduceFirstDim and reduceSecondDim
#ifdef PADDLE_WITH_XPU_KP
if (reduce_dim[0] == 0) {
reduce_type = static_cast<int>(ReduceType::kReduceHigherDim);
} else {
if (rank == 3 && not_higher) {
reduce_type = static_cast<int>(ReduceType::kReduceAny);
}
#else
reduce_type = static_cast<int>(ReduceType::kReduceHigherDim);
#endif
} else {
reduce_type = static_cast<int>(ReduceType::kReduceAny);
}
......@@ -648,7 +643,8 @@ __global__ void ReduceAnyKernel(const Tx* x,
bool reduce_last_dim,
const Calculator reduce_index_calculator,
const Calculator left_index_calculator,
const kps::DimConfig dim) {
const kps::DimConfig dim,
bool is_mean) {
int input_idx, left_idx, stride;
int block_size = 0;
bool need_store = true;
......@@ -752,7 +748,9 @@ __global__ void ReduceAnyKernel(const Tx* x,
kps::Reduce<MPType, 1, 1, 1, ReduceOp, kps::details::kGlobalMode>(
&reduce_var, &reduce_var, reducer, reduce_last_dim);
if (is_mean) {
reduce_var = reduce_var / static_cast<MPType>(reduce_num);
}
Ty result = static_cast<Ty>(reduce_var);
kps::details::WriteData<Ty>(
y + store_offset + i, &result, static_cast<int>(need_store));
......@@ -772,7 +770,9 @@ __global__ void ReduceHigherDimKernel(const Tx* x,
int reduce_num,
int left_num,
int blocking_size,
const kps::DimConfig dim) {
const kps::DimConfig dim,
int mean_div,
bool is_mean) {
// when reduce_dim.size() == 1 and reduce_dim[0] != x_dim.size() - 1, this
// function will be used
auto block = ReduceIndexMapping<false>(dim);
......@@ -806,6 +806,9 @@ __global__ void ReduceHigherDimKernel(const Tx* x,
kps::details::ReduceMode::kLocalMode>(
&reduce_var, &reduce_compute, reducer, false);
}
if (is_mean) {
reduce_var = reduce_var / static_cast<MPType>(mean_div);
}
Ty result = static_cast<Ty>(reduce_var);
kps::WriteData<Ty, 1, 1, 1, false>(
y + store_offset + idx, &result, block.BlockDimX());
......@@ -831,6 +834,10 @@ __global__ void ReduceHigherDimKernel(const Tx* x,
kps::details::ReduceMode::kLocalMode>(
&reduce_var, &reduce_compute, reducer, false);
}
if (is_mean) {
reduce_var = reduce_var / static_cast<MPType>(mean_div);
}
Ty result = static_cast<Ty>(reduce_var);
kps::WriteData<Ty, 1, 1, 1, true>(
y + store_offset + idx, &result, dim.rem_x);
......@@ -848,7 +855,8 @@ static void LaunchReduceKernel(const Tx* x_data,
const TransformOp& transform,
MPType init,
KPStream stream,
ReduceConfig<Ty> config) {
ReduceConfig<Ty> config,
bool is_mean = false) {
if (config.reduce_type == kReduceLastDim) {
int stride_reduce = 1;
int stride_left = config.reduce_num;
......@@ -887,7 +895,8 @@ static void LaunchReduceKernel(const Tx* x_data,
config.reduce_last_dim,
reduce_index_calculator,
left_index_calculator,
dim);
dim,
is_mean && (!config.should_reduce_again));
} else {
int reduce_rank = config.reduce_strides.size();
......@@ -930,7 +939,8 @@ static void LaunchReduceKernel(const Tx* x_data,
config.reduce_last_dim,
reduce_index_calculator,
left_index_calculator,
dim);
dim,
is_mean && (!config.should_reduce_again));
}
if (config.should_reduce_again) {
......@@ -950,15 +960,18 @@ static void LaunchReduceKernel(const Tx* x_data,
kps::DimConfig(grid.x, grid.y, grid.z, block.x, config.grid.y, 0);
dim.SetRem(config.left_num % block.x, 0, 0);
#ifdef PADDLE_WITH_XPU_KP
grid = 8;
block = 64;
int grid_size = 8;
int block_size = 64;
#else
auto grid_size = grid;
auto block_size = block;
#endif
ReduceHigherDimKernel<
Ty,
Ty,
MPType,
ReduceOp,
kps::IdentityFunctor<Ty, MPType>><<<grid, block, 0, stream>>>(
kps::IdentityFunctor<Ty, MPType>><<<grid_size, block_size, 0, stream>>>(
config.output_data,
y_data,
reducer,
......@@ -967,7 +980,9 @@ static void LaunchReduceKernel(const Tx* x_data,
config.grid.y,
config.left_num,
config.grid.y,
dim);
dim,
config.reduce_num,
is_mean);
}
}
......@@ -1034,7 +1049,8 @@ void ReduceKernel(const KPDevice& dev_ctx,
const phi::DenseTensor& x,
phi::DenseTensor* y,
const TransformOp& transform,
const std::vector<int>& origin_reduce_dims) {
const std::vector<int>& origin_reduce_dims,
bool is_mean = false) {
#ifdef PADDLE_WITH_XPU_KP
auto stream = dev_ctx.x_context()->xpu_stream;
#else
......@@ -1069,8 +1085,18 @@ void ReduceKernel(const KPDevice& dev_ctx,
bool use_cub_reduce = config.reduce_num == numel && !kIsTxFP16;
#ifndef PADDLE_WITH_XPU_KP
if (use_cub_reduce) {
if (is_mean) {
using Div = kps::DivideFunctor<Tx>;
CubTensorReduceImpl<Tx, Ty, ReduceOp, Div>(x_data,
y_data,
Div(config.reduce_num),
config.reduce_num,
dev_ctx,
stream);
} else {
CubTensorReduceImpl<Tx, Ty, ReduceOp, TransformOp>(
x_data, y_data, transform, config.reduce_num, dev_ctx, stream);
}
return;
}
#endif
......@@ -1115,7 +1141,9 @@ void ReduceKernel(const KPDevice& dev_ctx,
config.reduce_num,
config.left_num,
config.blocking_size,
dim);
dim,
config.reduce_num,
is_mean && (!config.should_reduce_again));
if (config.should_reduce_again) {
dim3 block = dim3(config.block.x, 1, 1);
......@@ -1125,15 +1153,19 @@ void ReduceKernel(const KPDevice& dev_ctx,
dim2.SetRem(config.left_num % config.block.x, 0, 0);
#ifdef PADDLE_WITH_XPU_KP
grid = 8;
block = 64;
int grid_size = 8;
int block_size = 64;
#else
auto grid_size = grid;
auto block_size = block;
#endif
ReduceHigherDimKernel<
Ty,
Ty,
MPType,
ReduceOp<MPType>,
kps::IdentityFunctor<Ty, MPType>><<<grid, block, 0, stream>>>(
kps::IdentityFunctor<Ty,
MPType>><<<grid_size, block_size, 0, stream>>>(
config.output_data,
y_data,
reducer,
......@@ -1142,7 +1174,9 @@ void ReduceKernel(const KPDevice& dev_ctx,
config.grid.y,
config.left_num,
config.grid.y,
dim2);
dim2,
config.reduce_num,
is_mean);
}
return;
}
......@@ -1151,7 +1185,14 @@ void ReduceKernel(const KPDevice& dev_ctx,
// when reduce_dim.size() != 1 and reduce_dim.size() != x_dim.size(), this
// function will be used
LaunchReduceKernel<Tx, Ty, MPType, ReduceOp<MPType>, TransformOp>(
x_data, y_data, reducer, transform, reducer.initial(), stream, config);
x_data,
y_data,
reducer,
transform,
reducer.initial(),
stream,
config,
is_mean);
}
} // namespace funcs
......
......@@ -49,3 +49,19 @@ PD_REGISTER_KERNEL(matmul_triple_grad,
phi::dtype::float16,
phi::dtype::complex<float>,
phi::dtype::complex<double>) {}
PD_REGISTER_KERNEL(matmul_with_flatten_grad,
GPU,
ALL_LAYOUT,
phi::MatmulWithFlattenGradKernel,
float,
double,
phi::dtype::float16) {}
PD_REGISTER_KERNEL(matmul_with_flatten_double_grad,
GPU,
ALL_LAYOUT,
phi::MatmulWithFlattenDoubleGradKernel,
float,
double,
phi::dtype::float16) {}
......@@ -30,3 +30,11 @@ PD_REGISTER_KERNEL(matmul,
phi::dtype::bfloat16,
phi::dtype::complex<float>,
phi::dtype::complex<double>) {}
PD_REGISTER_KERNEL(matmul_with_flatten,
GPU,
ALL_LAYOUT,
phi::MatmulWithFlattenKernel,
float,
double,
phi::dtype::float16) {}
......@@ -30,7 +30,8 @@ void Reduce(const KPDevice& dev_ctx,
const std::vector<int64_t>& dims,
bool keep_dim,
DataType out_dtype,
DenseTensor* out) {
DenseTensor* out,
bool is_mean = false) {
std::vector<int> reduce_dims =
phi::funcs::details::GetReduceDim(dims, x.dims().size(), reduce_all);
......@@ -57,12 +58,18 @@ void Reduce(const KPDevice& dev_ctx,
tmp_tensor,
out,
TransformOp<data_t, MPType>(reduce_num),
reduce_dims);
reduce_dims,
is_mean);
}));
} else {
using MPType = typename kps::details::MPTypeTrait<T>::Type;
phi::funcs::ReduceKernel<T, T, ReduceOp, TransformOp<T, MPType>>(
dev_ctx, x, out, TransformOp<T, MPType>(reduce_num), reduce_dims);
dev_ctx,
x,
out,
TransformOp<T, MPType>(reduce_num),
reduce_dims,
is_mean);
}
}
} // namespace phi
......
......@@ -27,8 +27,8 @@ void MeanRawKernel(const Context& dev_ctx,
bool reduce_all,
DenseTensor* out) {
auto out_dtype = x.dtype();
phi::Reduce<T, kps::AddFunctor, kps::DivideFunctor>(
dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
phi::Reduce<T, kps::AddFunctor, kps::IdentityFunctor>(
dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out, true);
}
template <typename T, typename Context>
......
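For context on the switch above from DivideFunctor to IdentityFunctor with is_mean=true: the mean is now produced as a sum reduction followed by a single division by the reduced element count, instead of dividing every element before reducing. A dependency-free sketch of the equivalent arithmetic:
#include <cstdio>
#include <vector>

int main() {
  std::vector<float> x = {1.f, 2.f, 3.f, 4.f};
  // Sum-reduce first ...
  float sum = 0.f;
  for (float v : x) sum += v;
  // ... then divide once by reduce_num, which is what is_mean=true triggers
  // inside the reduce kernels (the old path pre-divided each element).
  float mean = sum / static_cast<float>(x.size());
  std::printf("mean = %f\n", mean);  // 2.5
  return 0;
}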
......@@ -1731,4 +1731,163 @@ void MatmulTripleGradKernel(const Context& dev_ctx,
}
}
template <typename T, typename Context>
void MatmulWithFlattenGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
const DenseTensor& out_grad,
int x_num_col_dims,
int y_num_col_dims,
DenseTensor* x_grad,
DenseTensor* y_grad) {
auto x_matrix = x.dims().size() > 2
? paddle::framework::ReshapeToMatrix(x, x_num_col_dims)
: x;
auto y_matrix = y.dims().size() > 2
? paddle::framework::ReshapeToMatrix(y, y_num_col_dims)
: y;
auto* dout = &out_grad;
DenseTensor dout_mat(*dout);
dout_mat.Resize({phi::flatten_to_2d(x.dims(), x_num_col_dims)[0],
phi::flatten_to_2d(y.dims(), y_num_col_dims)[1]});
auto* dx = x_grad;
auto* dy = y_grad;
if (dx != nullptr) {
dx->set_lod(x.lod());
}
if (dy != nullptr) {
dy->set_lod(y.lod());
}
auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
if (dx) {
dev_ctx.template Alloc<T>(dx);
DenseTensor dx_matrix =
dx->dims().size() > 2
? paddle::framework::ReshapeToMatrix(*dx, x_num_col_dims)
: *dx;
// dx = dout * y'. dx: M x K, dout : M x N, y : K x N
blas.MatMul(dout_mat, false, y_matrix, true, &dx_matrix);
}
if (dy) {
dev_ctx.template Alloc<T>(dy);
DenseTensor dy_matrix =
dy->dims().size() > 2
? paddle::framework::ReshapeToMatrix(*dy, y_num_col_dims)
: *dy;
// dy = x' * dout. dy : K x N, dout : M x N, x : M x K
blas.MatMul(x_matrix, true, dout_mat, false, &dy_matrix);
}
}
template <typename T, typename Context>
void MatmulWithFlattenDoubleGradKernel(
const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
const DenseTensor& out_grad,
paddle::optional<const DenseTensor&> x_grad_grad,
paddle::optional<const DenseTensor&> y_grad_grad,
int x_num_col_dims,
int y_num_col_dims,
DenseTensor* x_grad,
DenseTensor* y_grad,
DenseTensor* out_grad_grad) {
auto x_mat = x.dims().size() > 2
? paddle::framework::ReshapeToMatrix(x, x_num_col_dims)
: x;
auto y_mat = y.dims().size() > 2
? paddle::framework::ReshapeToMatrix(y, y_num_col_dims)
: y;
const int m = phi::flatten_to_2d(x.dims(), x_num_col_dims)[0];
const int n = phi::flatten_to_2d(y.dims(), y_num_col_dims)[1];
auto* dout = &out_grad;
DenseTensor dout_mat(*dout);
dout_mat.Resize({m, n});
auto* ddx = x_grad_grad.get_ptr();
auto* ddy = y_grad_grad.get_ptr();
auto* dx = x_grad;
auto* dy = y_grad;
auto* ddout = out_grad_grad;
DenseTensor ddout_mat;
if (ddout) {
ddout->set_lod(dout->lod());
// allocate and reshape ddout
dev_ctx.template Alloc<T>(ddout);
ddout_mat.ShareDataWith(*ddout);
ddout_mat.Resize({m, n});
}
auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
// A flag recording whether ddout has already been written. While it is
// false, MatMul is called with beta = 0 so the result overwrites ddout;
// once it is true, beta = 1 so further results are accumulated into ddout.
bool ddout_flag = false;
if (ddx) {
auto ddx_mat =
ddx->dims().size() > 2
? paddle::framework::ReshapeToMatrix(*ddx, x_num_col_dims)
: static_cast<const DenseTensor&>(*ddx);
// dy = ddx' * dout. dy : K x N, ddx' : K x M, dout : M x N
if (dy) {
dy->set_lod(y.lod());
// allocate and reshape dy
dev_ctx.template Alloc<T>(dy);
DenseTensor dy_mat =
dy->dims().size() > 2
? paddle::framework::ReshapeToMatrix(*dy, y_num_col_dims)
: *dy;
blas.MatMul(ddx_mat, true, dout_mat, false, &dy_mat);
}
// ddout1 = ddx * y. ddx : M x K, y : K x N, ddout1 : M x N
if (ddout) {
blas.MatMul(ddx_mat,
false,
y_mat,
false,
static_cast<T>(1.0),
&ddout_mat,
static_cast<T>(ddout_flag));
ddout_flag = true;
}
}
if (ddy) {
auto ddy_mat =
ddy->dims().size() > 2
? paddle::framework::ReshapeToMatrix(*ddy, y_num_col_dims)
: static_cast<const DenseTensor&>(*ddy);
// dx = dout * ddy'. dout : M x N, ddy' : N x K, dx : M x K
if (dx) {
dx->set_lod(x.lod());
// allocate and reshape dx
dev_ctx.template Alloc<T>(dx);
DenseTensor dx_mat =
dx->dims().size() > 2
? paddle::framework::ReshapeToMatrix(*dx, x_num_col_dims)
: *dx;
blas.MatMul(dout_mat, false, ddy_mat, true, &dx_mat);
}
// ddout2 = x * ddy. x : M x K, ddy : K x N, ddout2 : M x N
if (ddout) {
blas.MatMul(x_mat,
false,
ddy_mat,
false,
static_cast<T>(1.0),
&ddout_mat,
static_cast<T>(ddout_flag));
}
}
}
} // namespace phi
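A minimal illustration (plain scalars, no BLAS or Paddle types) of the beta convention the ddout_flag comment above relies on: with beta = 0 the GEMM result overwrites the output, with beta = 1 it accumulates, so ddout ends up as ddx*y + x*ddy.
#include <cstdio>

// C = A*B + beta*C for 1x1 "matrices" -- enough to show the flag's effect.
static void Gemm1x1(float a, float b, float beta, float* c) {
  *c = a * b + beta * (*c);
}

int main() {
  float ddout = 0.f;
  // First contribution (ddx * y): beta = 0 overwrites whatever was in ddout.
  Gemm1x1(/*ddx=*/2.f, /*y=*/3.f, /*beta=*/0.f, &ddout);
  // Second contribution (x * ddy): beta = 1 accumulates onto the first one.
  Gemm1x1(/*x=*/4.f, /*ddy=*/5.f, /*beta=*/1.f, &ddout);
  std::printf("ddout = %f\n", ddout);  // 2*3 + 4*5 = 26
  return 0;
}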
......@@ -506,4 +506,34 @@ void MatmulKernel(const Context& dev_ctx,
MatMulFunction<Context, T>(dev_ctx, x, y, out, transpose_x, transpose_y);
}
template <typename T, typename Context>
void MatmulWithFlattenKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
int x_num_col_dims,
int y_num_col_dims,
DenseTensor* out) {
const DenseTensor x_matrix =
x.dims().size() > 2
? paddle::framework::ReshapeToMatrix(x, x_num_col_dims)
: x;
const DenseTensor y_matrix =
y.dims().size() > 2
? paddle::framework::ReshapeToMatrix(y, y_num_col_dims)
: y;
dev_ctx.template Alloc<T>(out);
auto z_dim = out->dims();
if (z_dim.size() != 2) {
out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
}
auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
blas.MatMul(x_matrix, y_matrix, out);
if (z_dim.size() != 2) {
out->Resize(z_dim);
}
}
} // namespace phi
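A hedged sketch of the flattening rule used by MatmulWithFlattenKernel above: an input of rank greater than 2 is reshaped to a matrix by multiplying the leading num_col_dims dimensions into the row count and the trailing ones into the column count (mirroring flatten_to_2d). Stand-alone arithmetic only, no Paddle types.
#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

// Collapse dims[0:num_col_dims) into rows and dims[num_col_dims:) into cols.
static std::pair<int64_t, int64_t> FlattenTo2d(const std::vector<int64_t>& dims,
                                               int num_col_dims) {
  int64_t rows = 1, cols = 1;
  for (int i = 0; i < static_cast<int>(dims.size()); ++i) {
    (i < num_col_dims ? rows : cols) *= dims[i];
  }
  return {rows, cols};
}

int main() {
  // x: [2, 3, 4, 5] with x_num_col_dims = 2 -> a 6 x 20 matrix.
  auto xm = FlattenTo2d({2, 3, 4, 5}, 2);
  // y: [4, 5, 7] with y_num_col_dims = 2 -> a 20 x 7 matrix.
  auto ym = FlattenTo2d({4, 5, 7}, 2);
  // The kernel then runs a plain GEMM, so the output matrix is 6 x 7.
  std::printf("x: %lldx%lld, y: %lldx%lld\n", (long long)xm.first,
              (long long)xm.second, (long long)ym.first, (long long)ym.second);
  return 0;
}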
......@@ -60,4 +60,28 @@ void MatmulTripleGradKernel(const Context& dev_ctx,
DenseTensor* out_d_ddx,
DenseTensor* out_d_ddy);
template <typename T, typename Context>
void MatmulWithFlattenGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
const DenseTensor& out_grad,
int x_num_col_dims,
int y_num_col_dims,
DenseTensor* x_grad,
DenseTensor* y_grad);
template <typename T, typename Context>
void MatmulWithFlattenDoubleGradKernel(
const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
const DenseTensor& out_grad,
paddle::optional<const DenseTensor&> x_grad_grad,
paddle::optional<const DenseTensor&> y_grad_grad,
int x_num_col_dims,
int y_num_col_dims,
DenseTensor* x_grad,
DenseTensor* y_grad,
DenseTensor* out_grad_grad);
} // namespace phi
......@@ -29,6 +29,16 @@ void MatmulKernel(const Context& dev_ctx,
bool transpose_y,
DenseTensor* out);
// This kernel exists only for compatibility with the `mul` op in fluid;
// `mul` is no longer used in the 2.x API.
template <typename T, typename Context>
void MatmulWithFlattenKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
int x_num_col_dims,
int y_num_col_dims,
DenseTensor* out);
template <typename T, typename Context>
DenseTensor Matmul(const Context& dev_ctx,
const DenseTensor& x,
......
set(SELECTED_ROWS_KERNEL_DEPS dense_tensor selected_rows sparse_csr_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils math_function custom_kernel)
register_kernels(DEPS ${SELECTED_ROWS_KERNEL_DEPS} SUB_DIR "selected_rows_kernel")
......@@ -19,7 +19,7 @@
#include "paddle/phi/backends/gpu/gpu_context.h"
#endif
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/selected_rows/isfinite_kernel_impl.h"
#include "paddle/phi/kernels/selected_rows/impl/isfinite_kernel_impl.h"
namespace phi {
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
KernelSignature MulGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
return KernelSignature("matmul_with_flatten_grad",
{"X", "Y", GradVarName("Out")},
{"x_num_col_dims", "y_num_col_dims"},
{GradVarName("X"), GradVarName("Y")});
}
KernelSignature MulDoubleGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature("matmul_with_flatten_double_grad",
{"X", "Y", "DOut", "DDX", "DDY"},
{"x_num_col_dims", "y_num_col_dims"},
{"DX", "DY", "DDOut"});
}
} // namespace phi
PD_REGISTER_BASE_KERNEL_NAME(mul, matmul_with_flatten);
PD_REGISTER_BASE_KERNEL_NAME(mul_grad, matmul_with_flatten_grad);
PD_REGISTER_BASE_KERNEL_NAME(mul_grad_grad, matmul_with_flatten_double_grad);
PD_REGISTER_ARG_MAPPING_FN(mul_grad, phi::MulGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(mul_grad_grad, phi::MulDoubleGradOpArgumentMapping);
......@@ -76,7 +76,7 @@ if not defined INFERENCE_DEMO_INSTALL_DIR set INFERENCE_DEMO_INSTALL_DIR=%cache_
if not defined LOG_LEVEL set LOG_LEVEL=normal
if not defined PRECISION_TEST set PRECISION_TEST=OFF
if not defined NIGHTLY_MODE set PRECISION_TEST=OFF
if not defined retry_times set retry_times=3
if not defined retry_times set retry_times=1
if not defined PYTHON_ROOT set PYTHON_ROOT=C:\Python37
if not defined BUILD_DIR set BUILD_DIR=build
set task_name=%1
......@@ -234,7 +234,6 @@ set WITH_MKL=OFF
set WITH_GPU=OFF
set WITH_AVX=OFF
set MSVC_STATIC_CRT=ON
set retry_times=1
set ON_INFER=OFF
call :cmake || goto cmake_error
......@@ -267,7 +266,6 @@ rem ------Build windows avx whl package------
set WITH_AVX=ON
set ON_INFER=OFF
set CUDA_ARCH_NAME=All
set retry_times=4
call :cmake || goto cmake_error
call :build || goto build_error
......@@ -279,7 +277,6 @@ rem ------Build windows no-avx whl package------
set WITH_AVX=OFF
set ON_INFER=OFF
set CUDA_ARCH_NAME=All
set retry_times=4
call :cmake || goto cmake_error
call :build || goto build_error
......
......@@ -209,6 +209,9 @@ function cmake_base() {
-DWITH_MKL=${WITH_MKL:-ON}
-DWITH_AVX=${WITH_AVX:-OFF}
-DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All}
-DNEW_RELEASE_PYPI=${NEW_RELEASE_PYPI:-OFF}
-DNEW_RELEASE_ALL=${NEW_RELEASE_ALL:-OFF}
-DNEW_RELEASE_JIT=${NEW_RELEASE_JIT:-OFF}
-DWITH_PYTHON=${WITH_PYTHON:-ON}
-DCUDNN_ROOT=/usr/
-DWITH_TESTING=${WITH_TESTING:-ON}
......@@ -262,6 +265,9 @@ EOF
-DWITH_AVX=${WITH_AVX:-OFF} \
-DNOAVX_CORE_FILE=${NOAVX_CORE_FILE:-""} \
-DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All} \
-DNEW_RELEASE_PYPI=${NEW_RELEASE_PYPI:-OFF} \
-DNEW_RELEASE_ALL=${NEW_RELEASE_ALL:-OFF} \
-DNEW_RELEASE_JIT=${NEW_RELEASE_JIT:-OFF} \
-DWITH_PYTHON=${WITH_PYTHON:-ON} \
-DCUDNN_ROOT=/usr/ \
-DWITH_TESTING=${WITH_TESTING:-ON} \
......
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License
from .base_cost import OP_COST_FACTORY
from .base_cost import Cost
from .comm_op_cost import AllreduceSumCost
from .comp_op_cost import MatmulV2OpCost
from .tensor_cost import TensorCost
from .estimate_cost import CostEstimator
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License
from collections import OrderedDict
import paddle
COMM_OP_TYPE = [
"send_v2", "recv_v2", "c_broadcast", "c_allgather", "c_allreduce_sum"
]
NON_COMP_TYPE = ["while"] + COMM_OP_TYPE
OP_COST_FACTORY = {}
def _parse_op_to_desc(op, dist_context=None):
desc = {}
desc["op"] = op.type
vars = op.block.vars
input_desc = OrderedDict()
for input_name in op.input_names:
var_name_list = op.input(input_name)
var_desc = []
for var_name in var_name_list:
var = vars[var_name]
shape = None
if dist_context is not None:
dist_tensor = dist_context.get_dist_tensor_for_program(var)
shape = dist_tensor.local_sizes()
else:
shape = var.shape
assert shape is not None
var_desc.append((var.dtype, shape))
input_desc[input_name] = var_desc
desc["inputs"] = input_desc
output_desc = OrderedDict()
for out_name in op.output_names:
var_name_list = op.output(out_name)
var_desc = []
for var_name in var_name_list:
var = vars[var_name]
shape = None
if dist_context is not None:
dist_tensor = dist_context.get_dist_tensor_for_program(var)
shape = dist_tensor.local_sizes()
else:
shape = var.shape
assert shape is not None
var_desc.append((var.dtype, shape))
output_desc[out_name] = var_desc
desc["outputs"] = output_desc
    attr_desc = op.all_attrs()
desc["attrs"] = attr_desc
return desc
def parse_to_desc(op=None, dist_op=None, dist_context=None):
desc = None
if op is None and dist_op is not None and dist_context is not None:
desc = _parse_op_to_desc(
op=dist_op.serial_op, dist_context=dist_context)
elif op is not None and dist_op is None and dist_context is None:
desc = _parse_op_to_desc(op)
return desc
def parse_desc_to_str(desc):
def _parse_dtype(dtype):
dtype_str = ""
if dtype == paddle.float32:
dtype_str = "float32"
elif dtype == paddle.float16:
dtype_str = "float16"
elif dtype == paddle.int32:
dtype_str = "int32"
elif dtype == paddle.int64:
dtype_str = "int64"
        elif dtype == paddle.uint8:
            dtype_str = "uint8"
else:
raise TypeError("Unsupported dtype {}".format(dtype))
return dtype_str
assert isinstance(desc, dict)
desc_str_list = []
desc_str = None
dtype_str_list = []
dims_list = []
shape_list = []
desc_str_list.append(desc["op"])
inputs = desc["inputs"]
for key, item in inputs.items():
for dtype, shape in item:
dtype_str_list.append(_parse_dtype(dtype))
shape_list += list(shape)
dims = len(shape)
dims_list.append(dims)
dtype_str = "*".join(dtype_str_list)
dims_list = [str(item) for item in dims_list]
dims_str = "*".join(dims_list)
shape_list = [str(item) for item in shape_list]
shape_str = "[" + ",".join(shape_list) + "]"
desc_str_list += [dtype_str, dims_str, shape_str]
desc_str = "_".join(desc_str_list)
return desc_str
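# Illustrative usage sketch, assuming a desc dict laid out the way
# _parse_op_to_desc builds it (inputs map to lists of (dtype, shape) pairs):
#
#     demo_desc = {"op": "matmul_v2",
#                  "inputs": {"X": [(paddle.float32, [20, 20])],
#                             "Y": [(paddle.float32, [20, 20])]},
#                  "outputs": {}, "attrs": {}}
#     parse_desc_to_str(demo_desc)
#     # -> "matmul_v2_float32*float32_2*2_[20,20,20,20]"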
class CommContext:
_instance = None
_has_instance = False
def __init__(self, cluster):
if CommContext._has_instance:
return
self.cluster = cluster
self._alpha_base_ring = 8.4
self._alpha_base_tree = 0
self._alpha_inter = None
        self._alpha_intra = None
self._beta = {}
def __new__(cls, *args, **kwargs):
if cls._instance is None:
            cls._instance = super().__new__(cls)
            cls._has_instance = True
return cls._instance
@property
def alpha_inter(self):
if self._alpha_inter is None:
            if self.cluster.alpha.inter == "NVL":
                self._alpha_inter = 3.4
            elif self.cluster.alpha.inter == "PHB":
                self._alpha_inter = 5.7
return self._alpha_inter
@property
def alpha_intra(self):
if self._alpha_intra is None:
            if self.cluster.alpha.intra == "NVL":
                self._alpha_intra = 28
            elif self.cluster.alpha.intra == "PHB":
                self._alpha_intra = 28
return self._alpha_intra
@property
def alpha_base_ring(self):
return self._alpha_base_ring
@property
def alpha_base_tree(self):
return self._alpha_base_tree
def get_beta(self, ranks):
key = ','.join(map(str, sorted(ranks)))
max_beta = None
        if key in self._beta:
max_beta = self._beta[key]
else:
for i in range(len(ranks)):
for j in range(i + 1, len(ranks)):
                    if max_beta is None:
                        max_beta = self.cluster.get_beta(ranks[i], ranks[j])
                    else:
                        beta = self.cluster.get_beta(ranks[i], ranks[j])
                        if beta > max_beta:
                            max_beta = beta
self._beta[key] = max_beta
return max_beta
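# Reading of get_beta, for illustration: for ranks [0, 1, 2] the returned value
# is the largest pairwise self.cluster.get_beta(i, j) over the pairs (0, 1),
# (0, 2) and (1, 2), cached under the key "0,1,2".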
class Cost:
def __init__(self, time=0, memory=0, flops=0):
self.time = time
self.memory = memory
self.flops = flops
def _check_time(self, val):
assert val >= 0, "Time must be greater than or equal to 0."
def _check_memory(self, val):
assert isinstance(
val, int) and val >= 0, "Memory must be int and greater than 0."
def _check_flops(self, val):
assert isinstance(
val, int) and val >= 0, "FLOPs must be int and greater than 0."
@property
def time(self):
return self._time
@time.setter
def time(self, val):
self._check_time(val)
self._time = val
@property
def memory(self):
return self._memory
@memory.setter
def memory(self, val):
self._check_memory(val)
self._memory = val
@property
def flops(self):
return self._flops
@flops.setter
def flops(self, val):
self._check_flops(val)
self._flops = val
def __add__(self, rhs):
assert isinstance(rhs, Cost)
time = self.time + rhs.time
memory = self.memory + rhs.memory
flops = self.flops + rhs.flops
assert (time >= 0 and memory >= 0 and flops >= 0)
return Cost(time, memory, flops)
def __sub__(self, rhs):
assert isinstance(rhs, Cost)
time = self.time - rhs.time
memory = self.memory - rhs.memory
flops = self.flops - rhs.flops
assert (time >= 0 and memory >= 0 and flops >= 0)
return Cost(time, memory, flops)
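# Minimal usage sketch of the Cost arithmetic above (illustrative only):
#
#     total = Cost(time=0.5, memory=100, flops=200) + Cost(time=0.5, memory=100, flops=0)
#     # total.time == 1.0, total.memory == 200, total.flops == 200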
class OpCost:
def __init__(self, op=None, op_desc=None):
assert (op is not None and op_desc is None) or (op is None and
op_desc is not None)
self._op = op
self._op_desc = op_desc
self._cost = self.calc_cost()
@property
def op(self):
return self._op
@property
def op_desc(self):
return self._op_desc
@property
def cost(self):
return self._cost
def calc_time(self):
return 0
def calc_memory(self):
return 0
def calc_flops(self):
return 0
def calc_cost(self):
time = self.calc_time()
memory = self.calc_memory()
flops = self.calc_flops()
cost = Cost(time, memory, flops)
return cost
class CommOpCost(OpCost):
OP_TYPE = "COMM"
def __init__(self, op=None, op_desc=None, comm_context=None):
super(CommOpCost, self).__init__(op=op, op_desc=op_desc)
self._check_comm_op_type()
self._comm_context = comm_context
@property
def comm_context(self):
return self._comm_context
@classmethod
def _check_comm_op_type(cls):
if cls.OP_TYPE != "COMM":
if cls.OP_TYPE not in COMM_OP_TYPE:
raise TypeError("Please Check op type in {}, but got {}.".
format(COMM_OP_TYPE, cls.OP_TYPE))
class CompOpCost(OpCost):
OP_TYPE = "COMP"
def __init__(self, op=None, op_desc=None, cluster=None):
super(CompOpCost, self).__init__(op=op, op_desc=op_desc)
self._check_comp_op_type()
self.cluster = cluster
@classmethod
def _check_comp_op_type(cls):
if cls.OP_TYPE != "COMP":
if cls.OP_TYPE in NON_COMP_TYPE:
raise TypeError("Please Check op type not in {}, but got {}.".
format(NON_COMP_TYPE, cls.OP_TYPE))
def register_op_cost(cls):
op_type = cls.OP_TYPE
    def register(op_type):
        OP_COST_FACTORY[op_type] = cls
        return cls
    return register(op_type)
def calc_time_from_model(op=None, desc=None, cluster=None, comm_context=None):
op_type = op.type if op is not None else desc["op"]
if op_type in COMM_OP_TYPE:
op_cost = OP_COST_FACTORY[op_type](op=op,
op_desc=desc,
comm_context=comm_context)
elif op_type not in NON_COMP_TYPE:
op_cost = OP_COST_FACTORY[op_type](op=op, op_desc=desc, cluster=cluster)
time = op_cost.calc_time()
return time
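# Illustrative sketch (assuming the registrations in comm_op_cost.py and
# comp_op_cost.py below have been imported): costs can be pulled straight from
# the factory, e.g.
#
#     cost_cls = OP_COST_FACTORY["matmul_v2"]               # MatmulV2OpCost
#     t = calc_time_from_model(desc={"op": "matmul_v2"})    # 0 for now (placeholder formula)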
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License
from .base_cost import register_op_cost, CommOpCost, OP_COST_FACTORY
@register_op_cost
class AllreduceSumCost(CommOpCost):
OP_TYPE = "c_allreduce_sum"
def __init__(self, op=None, op_desc=None, comm_context=None):
super(OP_COST_FACTORY["c_allreduce_sum"], self).__init__(
op=op, op_desc=op_desc, comm_context=comm_context)
def calc_time(self):
# NOTE: The actual formula will be filled in the future.
return 0
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License
from .base_cost import Cost, register_op_cost, CompOpCost, OP_COST_FACTORY
@register_op_cost
class MatmulV2OpCost(CompOpCost):
OP_TYPE = "matmul_v2"
def __init__(self, op=None, op_desc=None, cluster=None):
super(OP_COST_FACTORY["matmul_v2"], self).__init__(
op=op, op_desc=op_desc, cluster=cluster)
    # For a concrete COMP OP, the calc_time and calc_flops functions need to be overridden
def calc_flops(self):
# NOTE: The actual formula will be filled in the future
return 0
def calc_time(self):
# NOTE: The actual formula will be filled in the future
return 0
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License
class CostEstimator:
def __init__(self,
program,
cluster=None,
dist_context=None,
mode="modeling"):
self._program = program
self._cluster = cluster
self._dist_context = dist_context
self._check_mode(mode)
self._mode = mode
self._global_cost = None
self._local_cost = {}
@property
def program(self):
return self._program
@property
def dist_context(self):
return self._dist_context
@property
def cluster(self):
return self._cluster
@property
def mode(self):
return self._mode
@property
def global_cost(self):
return self._global_cost
@property
def local_cost(self):
return self._local_cost
def get_op_cost(self):
return 0
def get_tensor_cost(self):
return 0
def get_global_cost(self):
return 0
def get_local_cost(self, rank=None):
return 0
def _check_mode(self, mode):
if mode not in ["modeling", "profiling"]:
            raise ValueError(
                "Only 'modeling' and 'profiling' modes are supported, but got {}".format(mode))
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License
from functools import reduce
import paddle
from paddle.fluid.framework import Variable
from paddle.distributed.auto_parallel.dist_tensor import DistributedTensor
from .base_cost import Cost
class TensorCost:
def __init__(self, tensor=None, dist_tensor=None, shape=None, dtype=None):
self._check_args(tensor, dist_tensor, shape, dtype)
self._tensor = tensor
self._dist_tensor = dist_tensor
self._shape = shape
self._dtype = dtype
self._cost = self.calc_cost()
@property
def tensor(self):
return self._tensor
@property
def dist_tensor(self):
return self._dist_tensor
@property
def shape(self):
return self._shape
@property
def dtype(self):
return self._dtype
def _check_args(self, tensor, dist_tensor, shape, dtype):
if tensor is not None:
assert (shape is None and dist_tensor is None and dtype is None)
if not isinstance(tensor, Variable):
raise TypeError(
"Please check tensor type is Variable, but got {}".format(
type(tensor)))
elif dist_tensor is not None:
assert (tensor is None and shape is None)
if not isinstance(dist_tensor, DistributedTensor):
raise TypeError(
"Please check dist_tensor type is DistributedTensor, but got {}".
format(type(dist_tensor)))
elif shape is not None:
assert (tensor is None and dist_tensor is None and
dtype is not None)
if not isinstance(shape, (list, set)):
raise TypeError(
"Please check shape type is list or set, but got {}".format(
type(shape)))
elif dtype is not None:
assert (tensor is None and dist_tensor is None and
shape is not None)
@property
def cost(self):
return self._cost
def calc_cost(self):
dtype = None
shape = None
if self.dist_tensor:
shape = self.dist_tensor.local_sizes()
dtype = self.dist_tensor.serial_tensor.dtype
elif self.tensor:
shape = self.tensor.shape
dtype = self.tensor.dtype
elif self.shape and self.dtype:
shape = self.shape
dtype = self.dtype
total_count = reduce(lambda x, y: x * y, shape)
if dtype == paddle.float32 or dtype == paddle.int32:
dtype_factor = 4
        elif dtype == paddle.int64:
            dtype_factor = 8
        elif dtype == paddle.uint8:
dtype_factor = 1
else:
dtype_factor = 2
memory = total_count * dtype_factor
assert memory >= 0
cost = Cost(memory=memory)
return cost
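# Illustrative check of the memory model above: a float32 tensor of shape
# [20, 20] holds 400 elements at 4 bytes each, so
#
#     TensorCost(shape=[20, 20], dtype=paddle.float32).cost.memory  # -> 1600
#
# which matches the expectation in test_new_cost_model.py further below.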
......@@ -242,7 +242,8 @@ see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/tra
elastic_group.add_argument(
"--force", type=bool, default=False, help="update np force")
return parser.parse_args()
known_args, _ = parser.parse_known_args()
return known_args
def get_cluster_from_args(args, device_mode, devices_per_proc):
......
......@@ -25,12 +25,13 @@ class Context(object):
def __init__(self, enable_plugin=True):
self.args, self.unknown_args = parse_args()
self.envs = fetch_envs()
self.logger = self.get_logger()
self.set_env_in_args()
self.node = Node()
self.status = Status()
self.set_env_in_args()
self.logger = self.get_logger()
# design for event queue, later
self.events = []
......
......@@ -57,7 +57,7 @@ class Device(object):
else:
self._labels = []
def get_selected_flag_key(self):
def get_selected_device_key(self):
if self._dtype == DeviceType.CPU:
return 'FLAGS_selected_cpus'
if self._dtype == DeviceType.GPU:
......@@ -70,19 +70,15 @@ class Device(object):
return 'FLAGS_selected_mlus'
return 'FLAGS_selected_devices'
def get_selected_flag_label(self, idx):
if idx < len(self._labels):
return self._labels[idx]
def get_selected_devices(self, devices=''):
'''
return the device label/id relative to the visible devices
'''
if not devices:
return [str(x) for x in range(0, len(self._labels))]
else:
return '0'
def selected_flags(self, idx=None):
if idx is None:
return {self.get_selected_flag_key(): ','.join(self._labels)}
else:
return {
self.get_selected_flag_key(): self.get_selected_flag_label(idx)
}
devs = [x.strip() for x in devices.split(',')]
return [str(self._labels.index(d)) for d in devs]
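    # Illustrative sketch of the mapping above (label values assumed for the
    # example): with visible labels ['4', '5', '6', '7'],
    # get_selected_devices('5,7') returns ['1', '3'], i.e. indices relative to
    # the visible devices, while get_selected_devices() with no argument
    # returns ['0', '1', '2', '3'].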
@classmethod
def parse_device(self):
......
......@@ -75,6 +75,9 @@ class CollectiveController(Controller):
job_endpoints = [i['endpoints'] for i in peer_list]
self.pod.reset()
selected_dev_key = self.ctx.node.device.get_selected_device_key()
selected_dev_list = self.ctx.node.device.get_selected_devices(
self.ctx.args.devices)
for i in range(self.pod.replicas):
e = {
"PADDLE_MASTER": collective_master,
......@@ -90,9 +93,9 @@ class CollectiveController(Controller):
"PADDLE_RANK_IN_NODE": str(i),
}
if self.pod.replicas == 1:
e.update(self.ctx.node.device.selected_flags())
e.update({selected_dev_key: selected_dev_list})
else:
e.update(self.ctx.node.device.selected_flags(i))
e.update({selected_dev_key: selected_dev_list[i]})
self.add_container(envs=e, log_tag=i)
return True
......
......@@ -210,6 +210,8 @@ class Controller(ControllerBase):
if self.ctx.args.nproc_per_node:
return int(self.ctx.args.nproc_per_node)
elif self.ctx.args.devices:
return len(self.ctx.args.devices.split(','))
else:
return self.ctx.node.device.count
......
......@@ -29,8 +29,9 @@ def process_args(ctx):
#argdev = ctx.args.gpus or ctx.args.xpus or ctx.args.npus
argdev = ctx.args.devices
if argdev:
ctx.node.device.labels = argdev.split(',')
ctx.logger.debug('Device reset by args {}'.format(argdev))
for d in argdev.split(','):
            assert d in ctx.node.device.labels, 'Device not found {}'.format(d)
def collective_compatible(ctx):
......
......@@ -22,6 +22,10 @@ from paddle.fluid.framework import program_guard, device_guard
from paddle.fluid import unique_name, layers
from paddle.fluid.clip import append_gradient_clip_ops
from .pass_base import PassBase, PassType, register_pass
from paddle.distributed.auto_parallel.utils import set_var_dist_attr
from paddle.distributed.auto_parallel.utils import naive_set_dist_op_attr_for_program_by_mesh_and_mapping
from paddle.distributed.auto_parallel.process_group import get_world_process_group
world_process_group = get_world_process_group()
def _is_the_backward_op(op):
......@@ -68,15 +72,11 @@ def _remove_and_get_optimizer_op(main_program, dist_context):
def _remove_op_role_var(param, grad):
op_maker = core.op_proto_and_checker_maker
op = grad.op
assert _is_the_backward_op(op), \
'grad.op={} is not the backward op which produces the grad={}' \
.format(op, grad.name)
if op.has_attr(op_maker.kOpRoleVarAttrName()):
op._remove_attr(op_maker.kOpRoleVarAttrName())
def _get_gm_cond_var(main_program, k_steps):
def _get_gm_cond_var(main_program, k_steps, dist_context):
main_block = main_program.global_block()
# Add const var
k_step_var = layers.create_global_var(
......@@ -86,6 +86,7 @@ def _get_gm_cond_var(main_program, k_steps):
dtype='int32',
persistable=True,
force_cpu=True)
set_var_dist_attr(dist_context, k_step_var, [-1], world_process_group.ranks)
zero_var = layers.create_global_var(
name="gradient_merge_zero",
......@@ -94,6 +95,7 @@ def _get_gm_cond_var(main_program, k_steps):
dtype='int32',
persistable=True,
force_cpu=True)
set_var_dist_attr(dist_context, zero_var, [-1], world_process_group.ranks)
# Add step var & cond var
step_var = layers.create_global_var(
......@@ -103,6 +105,7 @@ def _get_gm_cond_var(main_program, k_steps):
dtype='int32',
persistable=True,
force_cpu=True)
set_var_dist_attr(dist_context, step_var, [-1], world_process_group.ranks)
cond_var = layers.create_global_var(
name="gradient_merge_cond",
......@@ -111,24 +114,29 @@ def _get_gm_cond_var(main_program, k_steps):
dtype='bool',
persistable=False,
force_cpu=True)
set_var_dist_attr(dist_context, cond_var, [-1], world_process_group.ranks)
with device_guard("cpu"):
# step_var = (step_var + 1) % k_step
layers.increment(x=step_var, value=1.0, in_place=True)
main_block.append_op(
elementwise_mod_op = main_block.append_op(
type='elementwise_mod',
inputs={'X': step_var,
'Y': k_step_var},
outputs={'Out': step_var},
attrs={'axis': -1,
'use_mkldnn': False})
naive_set_dist_op_attr_for_program_by_mesh_and_mapping(
elementwise_mod_op, world_process_group.ranks, [-1], dist_context)
# cond_var = (step_var == 0)
main_block.append_op(
equal_op = main_block.append_op(
type='equal',
inputs={'X': step_var,
'Y': zero_var},
outputs={'Out': cond_var})
naive_set_dist_op_attr_for_program_by_mesh_and_mapping(
equal_op, world_process_group.ranks, [-1], dist_context)
return cond_var
......@@ -137,7 +145,8 @@ def _append_gradient_merge_backward_op(
main_program,
startup_program,
params_grads: List[Tuple[Any, Any]],
cond_var_name: str) -> Tuple[List[Tuple[Any, Any]], Dict[str, Any]]:
cond_var_name: str,
dist_context) -> Tuple[List[Tuple[Any, Any]], Dict[str, Any]]:
main_block = main_program.global_block()
startup_block = startup_program.global_block()
......@@ -156,12 +165,19 @@ def _append_gradient_merge_backward_op(
param_name = param.name
param_var = main_block.var(param_name)
assert (param_var is not None)
ref_dist_attr = dist_context.get_tensor_dist_attr_for_program(param_var)
assert ref_dist_attr is not None
gradient_merge_var = main_block.create_var(
name=param_name + "@GRAD@GradientMerge",
shape=param_var.shape,
dtype=param_var.dtype,
persistable=True)
param_to_gradient_merge[param_name] = gradient_merge_var
ref_process_mesh = ref_dist_attr.process_mesh
ref_dims_mapping = ref_dist_attr.dims_mapping
set_var_dist_attr(dist_context, gradient_merge_var, ref_dims_mapping,
ref_process_mesh)
startup_gradient_merge_var = startup_block.create_var(
name=param_name + "@GRAD@GradientMerge",
......@@ -186,6 +202,8 @@ def _append_gradient_merge_backward_op(
attrs={'axis': -1,
'use_mkldnn': False})
new_params_to_grads.append([param, gradient_merge_var])
naive_set_dist_op_attr_for_program_by_mesh_and_mapping(
new_grad_op, ref_process_mesh, ref_dims_mapping, dist_context)
return new_params_to_grads, param_to_gradient_merge
......@@ -240,7 +258,7 @@ def _create_cond_block_and_update_optimizer(
new_op_desc.remove_attr(op_maker.kOpRoleVarAttrName())
# op's update Grad
if new_op_desc.input("Grad"):
if core.grad_var_suffix() in new_op_desc.input_arg_names():
grad_value = new_op_desc.input("Grad")[0]
# TODO FIXME(xym) support fp16
grad_merge_value = grad_value + '@GradientMerge'
......@@ -265,7 +283,7 @@ def _create_cond_block_and_update_optimizer(
def parse_program(main_program, startup_program, params_grads, k_steps, avg,
dist_context):
# 1 create gradient_merge_cond
cond_var = _get_gm_cond_var(main_program, k_steps)
cond_var = _get_gm_cond_var(main_program, k_steps, dist_context)
# 2 remove optimizer_op from main_program
optimize_ops_desc = _remove_and_get_optimizer_op(main_program, dist_context)
......@@ -275,7 +293,8 @@ def parse_program(main_program, startup_program, params_grads, k_steps, avg,
# 3 append gradient merge backward op to main_program
new_params_to_grads, param_to_gradient_merge = _append_gradient_merge_backward_op(
main_program, startup_program, params_grads, cond_var.name)
main_program, startup_program, params_grads, cond_var.name,
dist_context)
# 4 create ConditionalBlock and append gradient merge optimizer ops
_create_cond_block_and_update_optimizer(
......
......@@ -97,7 +97,9 @@ class Communicator(object):
recv_ctx,
proto_txt,
unit64_hosts,
scope=global_scope()):
scope=None):
        if scope is None:
scope = global_scope()
self.communicator_ = core.DistCommunicator(self.mode, proto_txt,
unit64_hosts, send_ctx,
recv_ctx, scope, self.envs)
......@@ -191,7 +193,9 @@ class Communicator(object):
def pull_dense(self, context):
self.communicator_.pull_dense(context)
def push_sparse_param(self, var_name, table_id=-1, scope=global_scope()):
def push_sparse_param(self, var_name, table_id=-1, scope=None):
        if scope is None:
scope = global_scope()
if not self.is_running():
raise ValueError(
"Communicator should init first. Using fleet.init_worker() before push_sparse_param()"
......
......@@ -105,9 +105,8 @@ def check_type(input, input_name, expected_type, op_name, extra_message=''):
if not isinstance(expected_type, tuple):
expected_type = (expected_type, )
expected_type += (core.VarBase, )
# TODO(jiabin): uncomment it when we support declarative mode in eager
# if _in_eager_mode():
# expected_type += (core.eager.Tensor, )
if core._in_eager_mode():
expected_type += (core.eager.Tensor, )
elif isinstance(input, core.VarBase):
raise TypeError(
"Please use `with fluid.dygraph.guard()` as context or `fluid.enable_dygraph()` to switch to imperative mode firstly. "
......
......@@ -17,4 +17,5 @@ if(WITH_DISTRIBUTE AND WITH_GPU)
py_test_modules(test_tunable_space MODULES test_tunable_space ENVS ${dist_ENVS})
py_test_modules(test_recorder MODULES test_recorder ENVS ${dist_ENVS})
py_test_modules(test_trial MODULES test_trial ENVS ${dist_ENVS})
py_test_modules(test_new_cost_model MODULES test_new_cost_model ENVS ${dist_ENVS})
endif()
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import paddle
import paddle.distributed.auto_parallel.cost as cost_model
from paddle.distributed.auto_parallel.cost.base_cost import parse_to_desc
from paddle.distributed.auto_parallel.cost.base_cost import parse_desc_to_str
from paddle.distributed.auto_parallel.cost.base_cost import calc_time_from_model
paddle.enable_static()
def check_cost(cost):
if cost.memory >= 0 and cost.flops >= 0 and cost.time >= 0:
return True
return False
class TestCost(unittest.TestCase):
def test_base_cost(self):
cost = cost_model.Cost(memory=100, flops=200, time=0.5)
self.assertTrue(check_cost(cost))
def test_comp_cost(self):
x = paddle.static.data(name="x", shape=[20, 20], dtype='float32')
y = paddle.static.data(name="y", shape=[20, 20], dtype='float32')
z = paddle.matmul(x, y)
matmul_v2_op = None
ops = paddle.static.default_main_program().global_block().ops
for op in ops:
if op.type == "matmul_v2":
matmul_v2_op = op
break
matmul_v2_cost = cost_model.OP_COST_FACTORY["matmul_v2"](
op=matmul_v2_op)
desc = parse_to_desc(op=matmul_v2_op)
desc_str = parse_desc_to_str(desc)
self.assertIsNotNone(desc_str)
self.assertTrue(check_cost(matmul_v2_cost.cost))
time = calc_time_from_model(op=matmul_v2_op)
self.assertEqual(time, matmul_v2_cost.cost.time)
tensor_cost = cost_model.TensorCost(tensor=x)
# check memory
self.assertEqual(tensor_cost.cost.memory, 1600)
def test_comm_cost(self):
desc = {}
desc["op"] = "c_allreduce_sum"
desc["inputs"] = {"X": [([100, 200], paddle.float32)]}
allreduce_cost = cost_model.OP_COST_FACTORY["c_allreduce_sum"](
op_desc=desc)
self.assertTrue(check_cost(allreduce_cost.cost))
def test_cost_estimator(self):
train_program = paddle.static.Program()
cost_estimator = cost_model.CostEstimator(train_program)
self.assertIsNotNone(cost_estimator)
if __name__ == "__main__":
unittest.main()
......@@ -31,6 +31,7 @@ from paddle.fluid.initializer import NumpyArrayInitializer
from paddle.distributed.passes import new_pass, PassManager, PassContext
import paddle.distributed.fleet as fleet
from dist_pass_test_base import DistPassTestBase
from paddle.distributed.auto_parallel.dist_context import DistributedContext
logging.getLogger().setLevel(logging.INFO)
paddle.enable_static()
......@@ -111,14 +112,20 @@ class TestGradientMergePass(DistPassTestBase):
def init(self):
self._params_grads = None
self._config = {"k_steps": 4, "avg": True}
#self._config["dist_context"] = DistributedContext()
def apply_passes(self, main_prog, startup_prog):
self._config["params_grads"] = self._params_grads
pass_context = PassContext()
auto_parallel_gradient_merge_pass = new_pass(
"auto_parallel_gradient_merge_pass", self._config)
auto_parallel_gradient_merge_pass.apply([main_prog], [startup_prog],
pass_context)
#self._config["params_grads"] = self._params_grads
#pass_context = PassContext()
#auto_parallel_gradient_merge_pass = new_pass(
# "auto_parallel_gradient_merge_pass", self._config)
#auto_parallel_gradient_merge_pass.apply([main_prog], [startup_prog],
# pass_context)
dist_strategy = fleet.DistributedStrategy()
dist_strategy.gradient_merge = True
dist_strategy.gradient_merge_configs = {"k_steps": 4, "avg": True}
dist_strategy.semi_auto = True
fleet.init(is_collective=True, strategy=dist_strategy)
def test_result(self):
no_pass_rets = self._distributed_launch(
......@@ -135,7 +142,7 @@ class TestGradientMergePass(DistPassTestBase):
gradient_merge=True,
batch_size=8,
max_step=8)
"""
# avg loss for gradient_merge pass
avg_loss = 0
pass_avg_ret_list = []
......@@ -156,6 +163,7 @@ class TestGradientMergePass(DistPassTestBase):
rtol=self.rtol,
atol=self.atol,
equal_nan=self.equal_nan))
"""
def get_model(self, place, gradient_merge, batch_size, max_step):
paddle.seed(2021)
......
......@@ -20,6 +20,7 @@ import unittest
import paddle
from paddle.fluid.dygraph.jit import declarative
from paddle.fluid.dygraph.dygraph_to_static.program_translator import ProgramTranslator
import paddle.fluid.core as core
from ifelse_simple_func import *
......@@ -379,7 +380,7 @@ class TestDy2StIfElseRetInt1(unittest.TestCase):
return out
def test_ast_to_func(self):
self.assertIsInstance(self.out[0], paddle.Tensor)
self.assertIsInstance(self.out[0], (paddle.Tensor, core.eager.Tensor))
self.assertIsInstance(self.out[1], int)
......@@ -390,8 +391,8 @@ class TestDy2StIfElseRetInt2(TestDy2StIfElseRetInt1):
self.out = self.get_dy2stat_out()
def test_ast_to_func(self):
self.assertIsInstance(self.out[0], paddle.Tensor)
self.assertIsInstance(self.out[1], paddle.Tensor)
self.assertIsInstance(self.out[0], (paddle.Tensor, core.eager.Tensor))
self.assertIsInstance(self.out[1], (paddle.Tensor, core.eager.Tensor))
class TestDy2StIfElseRetInt3(TestDy2StIfElseRetInt1):
......@@ -401,7 +402,7 @@ class TestDy2StIfElseRetInt3(TestDy2StIfElseRetInt1):
self.out = self.get_dy2stat_out()
def test_ast_to_func(self):
self.assertIsInstance(self.out, paddle.Tensor)
self.assertIsInstance(self.out, (paddle.Tensor, core.eager.Tensor))
class TestDy2StIfElseRetInt4(TestDy2StIfElseRetInt1):
......
......@@ -118,7 +118,8 @@ class TestWithNestedOutput(unittest.TestCase):
self.assertTrue(len(dygraph_res) == len(static_res))
for dy_var, st_var in zip(dygraph_res, static_res):
if isinstance(dy_var, fluid.core.VarBase):
if isinstance(dy_var,
(fluid.core.VarBase, fluid.core.eager.Tensor)):
self.assertTrue(np.allclose(dy_var.numpy(), st_var.numpy()))
else:
self.assertTrue(dy_var, st_var)
......
......@@ -218,7 +218,7 @@ class TestReturnBase(unittest.TestCase):
res = self.dygraph_func(self.input)
if isinstance(res, (tuple, list)):
return tuple(r.numpy() for r in res)
elif isinstance(res, core.VarBase):
elif isinstance(res, (core.VarBase, core.eager.Tensor)):
return res.numpy()
return res
......
......@@ -713,44 +713,76 @@ class OpTest(unittest.TestCase):
def is_empty(a):
return isinstance(a, Empty)
def get_default(idx, all_params_number, defaults):
related_idx = idx - all_params_number + len(defaults)
assert related_idx >= 0, "%d-th arguments don't have default value" % idx
return defaults[related_idx]
def filter_by_name(x):
names = set(['name', 'out', 'output'])
if isinstance(x, list): return [i for i in x if i not in names]
if isinstance(x, dict):
return {k: v for k, v in x.items() if k not in names}
assert False, "Only support list or dict."
def get_default(idx, defaults):
assert not isinstance(
defaults[idx], Empty
), "%d-th params of python api don't have default value." % idx
return defaults[idx]
def to_defaults_list(params, defaults):
return [defaults[p] for p in params if p in defaults]
        # NOTE(xiongkun): why don't we use the input arguments dict?
        # Because we don't know the python api name of each argument.
        # Using parse_arg_and_kwargs, we can get all the api information we need.
api_params, api_defaults = [
filter_by_name(item) for item in parse_arg_and_kwargs(api)
]
def parse_attri_value(name, op_inputs, op_attrs):
""" parse true value from inputs and attrs, if there is no name passed by OpTest, return Empty
1. if the name in op_attrs, use the op_attrs[name]
2. if the name in op_inputs, convert the op_inputs to [type of default value]
3. if the name not in op_attrs ans op_inputs, return Empty. (this will use the default value from python api)
"""
if name in op_proto_attrs:
return op_proto_attrs[name]
elif name in op_inputs:
assert op_inputs[name].__len__(
) == 1, "currently don't support multi-input in attribute."
                # why don't we use numpy().item(): if the Tensor is float64, we would change it to python float32, where we lose accuracy: [allclose_op]
# why we reconstruct a tensor: because we want the tensor in cpu.
return paddle.to_tensor(
op_inputs[name][0].numpy(), place='cpu')
else:
return Empty()
        # NOTE(xiongkun): the logic of constructing parameters:
        # for example:
        #    python api: cumprod(x, dim, dtype=None, name=None)
        #    kernel sig: [["x"], ["dim"], ["out"]]
        #
        # we construct several lists with the same length: len == len(api_params), here 4
        #    api_params = ["x", "dim", "dtype", "name"]
        #    api_defaults = [Empty, Empty, None, None]; Empty means no default.
        #    inputs_and_attrs = ["x", "dim"]; the length may be shorter or longer than api_params
        #    input_arguments = [RealValue in self.inputs and self.attrs]
        # then we loop over api_params and construct a result list:
        #    if the name is in ['name', 'dtype', 'out', 'output'], we use the default value
        #    else, we consume one input_argument (the names don't correspond, so we rely only on the order)
api_params, api_defaults = parse_arg_and_kwargs(api)
api_defaults = to_defaults_list(api_params, api_defaults)
api_defaults = [
Empty() for i in range(len(api_params) - len(api_defaults))
] + api_defaults
assert len(api_defaults) == len(
api_params), "Error happens. contack xiongkun03 to solve."
inputs_sig, attrs_sig, outputs_sig = kernel_sig
inputs_and_attrs = inputs_sig + attrs_sig
assert (
len(api_params) == len(inputs_and_attrs)
), "inputs and attrs length must equals to python api length. (May be output is in argument list?)"
input_arguments = [op_proto_ins[name] for name in inputs_sig] + [
op_proto_attrs[name] if name in op_proto_attrs else Empty()
parse_attri_value(name, op_proto_ins, op_proto_attrs)
for name in attrs_sig
]
results = []
for idx, arg in enumerate(input_arguments):
if is_empty(arg):
results.append(
get_default(idx, len(input_arguments), api_defaults))
api_ignore_param_list = set(['name', 'dtype', 'out', 'output'])
idx_of_op_proto_arguments = 0
for idx, arg_name in enumerate(api_params):
if arg_name in api_ignore_param_list:
results.append(get_default(idx, api_defaults))
else:
assert idx_of_op_proto_arguments < len(
input_arguments), "Assert False."
tmp = input_arguments[idx_of_op_proto_arguments]
idx_of_op_proto_arguments += 1
if isinstance(tmp, Empty):
results.append(get_default(idx, api_defaults))
else:
results.append(arg)
results.append(tmp)
assert len(results) == len(api_params)
return results
def construct_output_dict_by_kernel_sig(ret_tuple, output_sig):
......
......@@ -251,6 +251,9 @@ class EagerVariablePropertiesAndMethodsTestCase(unittest.TestCase):
self.assertTrue(egr_tensor12.place._equals(paddle.fluid.CPUPlace()))
self.assertTrue(np.array_equal(egr_tensor12.numpy(), x))
egr_tensor13 = paddle.randn([2, 2])
self.assertTrue("eager_tmp" in egr_tensor13.name)
with self.assertRaisesRegexp(
ValueError, "The shape of Parameter should not be None"):
eager_param = EagerParamBase(shape=None, dtype="float32")
......
......@@ -64,7 +64,10 @@ class Collective_Test(unittest.TestCase):
if args:
cmd.extend(args.split(" "))
cmd.extend([pyname])
proc = subprocess.Popen(cmd, env)
env = os.environ.copy()
        # virtual devices for testing
env.update({'CUDA_VISIBLE_DEVICES': '0,1,2,3,4,5,6,7'})
proc = subprocess.Popen(cmd, env=env)
return proc
def test_collective_1(self):
......
......@@ -17,25 +17,53 @@ import unittest
import numpy as np
import paddle
from paddle import _C_ops
from paddle.fluid import core
from paddle.fluid.framework import _test_eager_guard
class TestSparseUtils(unittest.TestCase):
def test_create_sparse_coo_tensor(self):
with _test_eager_guard():
non_zero_indices = [[0, 0, 1, 2, 2], [1, 3, 2, 0, 1]]
non_zero_elements = [1, 2, 3, 4, 5]
dense_shape = [3, 4]
dense_indices = paddle.to_tensor(non_zero_indices)
dense_elements = paddle.to_tensor(
non_zero_elements, dtype='float32')
stop_gradient = False
coo = core.eager.sparse_coo_tensor(dense_indices, dense_elements,
dense_shape, stop_gradient)
print(coo)
def test_create_sparse_csr_tensor(self):
with _test_eager_guard():
non_zero_crows = [0, 2, 3, 5]
non_zero_cols = [1, 3, 2, 0, 1]
non_zero_elements = [1, 2, 3, 4, 5]
dense_shape = [3, 4]
dense_crows = paddle.to_tensor(non_zero_crows)
dense_cols = paddle.to_tensor(non_zero_cols)
dense_elements = paddle.to_tensor(
non_zero_elements, dtype='float32')
stop_gradient = False
csr = core.eager.sparse_csr_tensor(dense_crows, dense_cols,
dense_elements, dense_shape,
stop_gradient)
print(csr)
def test_to_sparse_coo(self):
with _test_eager_guard():
x = [[0, 1, 0, 2], [0, 0, 3, 0], [4, 5, 0, 0]]
non_zero_indices = [[0, 0, 1, 2, 2], [1, 3, 2, 0, 1]]
non_zero_elements = [1, 2, 3, 4, 5]
dense_x = paddle.to_tensor(x)
#TODO(zhangkaihuo): change to test the corresponding API
out = _C_ops.final_state_to_sparse_coo(dense_x, 2)
print(out)
out = dense_x.to_sparse_coo(2)
assert np.array_equal(out.non_zero_indices().numpy(),
non_zero_indices)
assert np.array_equal(out.non_zero_elements().numpy(),
non_zero_elements)
dense_tensor = _C_ops.final_state_to_dense(out)
dense_tensor = out.to_dense()
assert np.array_equal(dense_tensor.numpy(), x)
def test_to_sparse_csr(self):
......@@ -45,14 +73,14 @@ class TestSparseUtils(unittest.TestCase):
non_zero_cols = [1, 3, 2, 0, 1]
non_zero_elements = [1, 2, 3, 4, 5]
dense_x = paddle.to_tensor(x)
out = _C_ops.final_state_to_sparse_csr(dense_x)
out = dense_x.to_sparse_csr()
print(out)
assert np.array_equal(out.non_zero_crows().numpy(), non_zero_crows)
assert np.array_equal(out.non_zero_cols().numpy(), non_zero_cols)
assert np.array_equal(out.non_zero_elements().numpy(),
non_zero_elements)
dense_tensor = _C_ops.final_state_to_dense(out)
dense_tensor = out.to_dense()
assert np.array_equal(dense_tensor.numpy(), x)
......
......@@ -307,6 +307,7 @@ packages=['paddle',
'paddle.distributed.auto_parallel',
'paddle.distributed.auto_parallel.operators',
'paddle.distributed.auto_parallel.tuner',
'paddle.distributed.auto_parallel.cost',
'paddle.distributed.passes',
'paddle.framework',
'paddle.jit',
......
......@@ -22,7 +22,9 @@ attr_type_converter = {
"i": 'SI32Attr',
"b": 'BoolAttr',
"l": 'SI64Attr',
"f": 'F32Attr'
"f": 'F32Attr',
"NSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE": 'StrAttr',
"St6vectorIiSaIiEE": 'I32ArrayAttr'
}
target_type_converter = {"CPU": "CPU", "GPU": "GPU"}
......
......@@ -38,35 +38,36 @@ python3 ${PADDLE_ROOT}/python/paddle/utils/code_gen/wrapped_infermeta_gen.py \
--wrapped_infermeta_header_path ${temp_path}/generate.h \
--wrapped_infermeta_source_path ${temp_path}/generate.cc
grep PD_REGISTER_INFER_META_FN ${temp_path}/generate.cc \
find ${PADDLE_ROOT}/paddle/phi/ -name "*.cc" | xargs grep PD_REGISTER_INFER_META_FN ${temp_path}/generate.cc \
| awk -F "\(|,|::|\)" '{print $2, $4}' > ${temp_path}/wrap_info.txt
#step 3:get ir's attr_name.
ir_attr_name_info_file=`mktemp`
# phi_cpu attr
all_ir_name=`grep -Eo "PDTCPU_Kernel<.*\"" paddle/infrt/dialect/phi/ir/phi_cpu_kernels.td | awk -v FS="<" '{gsub(/\"/,"");print $2}'`
all_ir_name=`grep -Eo "PDTCPU_Kernel<.*\"" ${PADDLE_ROOT}/paddle/infrt/dialect/phi/ir/phi_cpu_kernels.td | awk -v FS="<" '{gsub(/\"/,"");print $2}'`
for ir in $all_ir_name
do
attr_name=`grep "<\"$ir" -A 3 paddle/infrt/dialect/phi/ir/phi_cpu_kernels.td | grep -Eo "Attr:.*)" \
attr_name=`grep "<\"$ir" -A 3 ${PADDLE_ROOT}/paddle/infrt/dialect/phi/ir/phi_cpu_kernels.td | grep -Eo "Attr:.*)" \
| awk '{gsub(/F32Attr/,"");gsub(/F64Attr/,"");gsub(/StrAttr/,"");gsub(/BoolAttr/,""); \
gsub(/SI1Attr/,"");gsub(/SI8Attr/,"");gsub(/SI16Attr/,"");gsub(/SI32Attr/,"");gsub(/SI64Attr/,""); \
gsub(/UI1Attr/,"");gsub(/UI8Attr/,"");gsub(/I16Attr/,"");gsub(/I32Attr/,"");gsub(/I64Attr/,""); \
gsub(/I1Attr/,"");gsub(/I8Attr/,"");gsub(/UI16Attr/,"");gsub(/UI32Attr/,"");gsub(/UI64Attr/,""); \
gsub(/I32ArrayAttr/,"");gsub(/SI32ArrayAttr/,""); \
gsub(/Attr/,"");gsub(/\)/,""); \
gsub(/[,:]/,"");print $a}'`
echo phi_cpu.$ir $attr_name >> $ir_attr_name_info_file
done
# phi_gpu attr
all_ir_name=`grep -Eo "PDTGPU_Kernel<.*\"" paddle/infrt/dialect/phi/ir/phi_gpu_kernels.td | awk -v FS="<" '{gsub(/\"/,"");print $2}'`
all_ir_name=`grep -Eo "PDTGPU_Kernel<.*\"" ${PADDLE_ROOT}/paddle/infrt/dialect/phi/ir/phi_gpu_kernels.td | awk -v FS="<" '{gsub(/\"/,"");print $2}'`
for ir in $all_ir_name
do
attr_name=`grep "<\"$ir" -A 3 paddle/infrt/dialect/phi/ir/phi_gpu_kernels.td | grep -Eo "Attr:.*)" \
attr_name=`grep "<\"$ir" -A 3 ${PADDLE_ROOT}/paddle/infrt/dialect/phi/ir/phi_gpu_kernels.td | grep -Eo "Attr:.*)" \
| awk '{gsub(/F32Attr/,"");gsub(/F64Attr/,"");gsub(/StrAttr/,"");gsub(/BoolAttr/,""); \
gsub(/SI1Attr/,"");gsub(/SI8Attr/,"");gsub(/SI16Attr/,"");gsub(/SI32Attr/,"");gsub(/SI64Attr/,""); \
gsub(/UI1Attr/,"");gsub(/UI8Attr/,"");gsub(/I16Attr/,"");gsub(/I32Attr/,"");gsub(/I64Attr/,""); \
gsub(/I1Attr/,"");gsub(/I8Attr/,"");gsub(/UI16Attr/,"");gsub(/UI32Attr/,"");gsub(/UI64Attr/,""); \
gsub(/Attr/,"");gsub(/\)/,""); \
gsub(/I32ArrayAttr/,"");gsub(/SI32ArrayAttr/,""); \
gsub(/Attr/,"");gsub(/\)/,"") \
gsub(/[,:]/,"");print $a}'`
echo phi_gpu.$ir $attr_name >> $ir_attr_name_info_file
done
......
......@@ -58,7 +58,7 @@ def get_api_yaml_info(file_path):
def get_kernel_info(file_path):
f = open(file_path, "r")
cont = f.readlines()
return [l.strip() for l in cont]
return [l.strip() for l in cont if l.strip() != ""]
def get_attr_info(file_path):
......@@ -91,10 +91,9 @@ def merge(infer_meta_data, kernel_data, wrap_data):
full_kernel_data = []
for l in kernel_data:
key = l.split()[0]
if key in meta_map:
if key in meta_map:
if key in wrap_map:
full_kernel_data.append((l + " " + wrap_map[key]).split())
else:
elif key in meta_map:
full_kernel_data.append((l + " " + meta_map[key]).split())
else:
full_kernel_data.append((l + " unknown").split())
......@@ -246,15 +245,10 @@ def gen_register_code_info(item: List[str], attr_data: Dict[str, List[str]]):
registry->AddKernelWithAttrs("{ir_name}","""
res += f"""
std::bind(&KernelLauncherFunc<decltype({kernel_func}),
&KernelLauncherFunc<decltype({kernel_func}),
{kernel_func},
decltype({infer_shape_func}),
{infer_shape_func}>,
KernelLauncher<decltype({kernel_func}),
{kernel_func},
decltype({infer_shape_func}),
{infer_shape_func}>(),
std::placeholders::_1),
{{{attr_names}}});
"""
......@@ -263,15 +257,10 @@ registry->AddKernelWithAttrs("{ir_name}","""
registry->AddKernel("{ir_name}","""
res += f"""
std::bind(&KernelLauncherFunc<decltype({kernel_func}),
{kernel_func},
decltype({infer_shape_func}),
{infer_shape_func}>,
KernelLauncher<decltype({kernel_func}),
&KernelLauncherFunc<decltype({kernel_func}),
{kernel_func},
decltype({infer_shape_func}),
{infer_shape_func}>(),
std::placeholders::_1));
{infer_shape_func}>);
"""
return res
......
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import re
import json
skip_list = []
def remove_grad_kernel(kernels):
clean_kernels = []
for kernel_ in kernels:
if (not "_grad" in kernel_):
clean_kernels.append(kernel_)
return clean_kernels
CPU_KERNEL_REGISTER = "REGISTER_OP_CPU_KERNEL("
GPU_KERNEL_REGISTER = "REGISTER_OP_CUDA_KERNEL("
XPU_KERNEL_REGISTER = "REGISTER_OP_XPU_KERNEL("
def get_compat_kernels_info(register):
kernels_info = {}
kernel_names = []
for dirpath, dirnames, filenames in os.walk("../../paddle/fluid/operators"):
for file_name in filenames:
if not ".cc" in file_name:
continue
with open(os.path.join(dirpath, file_name)) as f:
txt = f.readlines()
content = ""
registry = False
is_macro_defination = False
for line in txt:
if line.strip().startswith("#define") and line.strip(
).endswith("\\"):
is_macro_defination = True
continue
if is_macro_defination:
if not line.strip().endswith("\\"):
is_macro_defination = False
continue
if (register in line):
content = ""
registry = True
if (registry):
content += line
if (registry and ";" in line):
kernel_name = content.replace("\n", "").replace(
" ", "").strip(register).split(",")
registry = False
kernel_names.append(kernel_name[0])
return remove_grad_kernel(kernel_names)
def show_kernel_statistics(backend, kernels):
print("=== kernels statistics === ")
print("the number of " + backend + " kernels is: " + str(len(kernels)) +
"\n")
print(kernels)
print("\n")
def show_pass_statistics(backend, passes):
print("=== Passes Statistics === ")
print("The number of " + backend + " passes is: " + str(len(passes)) + "\n")
print(passes)
print("\n")
def get_passes_info(register):
pass_registry_func = ""
with open("../../paddle/fluid/inference/api/paddle_pass_builder.cc") as f:
txt = f.readlines()
stack = []
registry_fun_found = False
for line in txt:
if line.strip().startswith("//"):
continue
if register in line:
registry_fun_found = True
if (registry_fun_found):
pass_registry_func += line
if registry_fun_found:
for char in line:
if char == "{":
stack.append(char)
if char == "}":
stack.pop()
if len(stack) == 0:
registry_fun_found = False
pass_list = re.findall("\"(.+?)_pass\"", pass_registry_func)
return pass_list
if __name__ == "__main__":
cpu_kernels = get_compat_kernels_info(CPU_KERNEL_REGISTER)
gpu_kernels = get_compat_kernels_info(GPU_KERNEL_REGISTER)
xpu_kernels = get_compat_kernels_info(XPU_KERNEL_REGISTER)
show_kernel_statistics("CPU", cpu_kernels)
show_kernel_statistics("GPU", gpu_kernels)
show_kernel_statistics("XPU", xpu_kernels)
cpu_passes = get_passes_info("CpuPassStrategy::CpuPassStrategy()")
gpu_passes = get_passes_info("GpuPassStrategy::GpuPassStrategy()")
show_pass_statistics("CPU", cpu_passes)
show_pass_statistics("GPU", gpu_passes)