diff --git a/cmake/phi.cmake b/cmake/phi.cmake index 1c4dd723b9b71ffaab33599eb13ee8235393a097..f1241aaa66bb8e5494a51dbe87ecfa2d5a5fd8bc 100644 --- a/cmake/phi.cmake +++ b/cmake/phi.cmake @@ -100,7 +100,6 @@ function(kernel_library TARGET) set(xpu_srcs) set(gpudnn_srcs) set(kps_srcs) - set(selected_rows_srcs) # parse and save the deps kerenl targets set(all_srcs) set(kernel_deps) @@ -111,6 +110,12 @@ function(kernel_library TARGET) cmake_parse_arguments(kernel_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + # used for cc_library selected_rows dir target + set(target_suffix "") + if ("${kernel_library_SUB_DIR}" STREQUAL "selected_rows_kernel") + set(target_suffix "_sr") + endif() list(LENGTH kernel_library_SRCS kernel_library_SRCS_len) # one kernel only match one impl file in each backend @@ -121,9 +126,6 @@ function(kernel_library TARGET) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/cpu/${TARGET}.cc AND NOT WITH_XPU_KP) list(APPEND cpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/${TARGET}.cc) endif() - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/selected_rows/${TARGET}.cc) - list(APPEND selected_rows_srcs ${CMAKE_CURRENT_SOURCE_DIR}/selected_rows/${TARGET}.cc) - endif() if (WITH_GPU OR WITH_ROCM) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu) list(APPEND gpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu) @@ -169,26 +171,46 @@ function(kernel_library TARGET) list(APPEND all_srcs ${xpu_srcs}) list(APPEND all_srcs ${gpudnn_srcs}) list(APPEND all_srcs ${kps_srcs}) + + set(all_include_kernels) + set(all_kernel_name) + foreach(src ${all_srcs}) file(READ ${src} target_content) + # "kernels/xxx"(DenseTensor Kernel) can only include each other, but can't include "SUB_DIR/xxx" (such as selected_rows Kernel) string(REGEX MATCHALL "#include \"paddle\/phi\/kernels\/[a-z0-9_]+_kernel.h\"" include_kernels ${target_content}) - if ("${kernel_library_SUB_DIR}" STREQUAL "") - string(REGEX MATCHALL "#include \"paddle\/phi\/kernels\/[a-z0-9_]+_kernel.h\"" include_kernels ${target_content}) - else() + list(APPEND all_include_kernels ${include_kernels}) + + # "SUB_DIR/xxx" can include "kernels/xx" and "SUB_DIR/xxx" + if (NOT "${kernel_library_SUB_DIR}" STREQUAL "") string(REGEX MATCHALL "#include \"paddle\/phi\/kernels\/${kernel_library_SUB_DIR}\/[a-z0-9_]+_kernel.h\"" include_kernels ${target_content}) + list(APPEND all_include_kernels ${include_kernels}) endif() - foreach(include_kernel ${include_kernels}) + + foreach(include_kernel ${all_include_kernels}) if ("${kernel_library_SUB_DIR}" STREQUAL "") string(REGEX REPLACE "#include \"paddle\/phi\/kernels\/" "" kernel_name ${include_kernel}) + string(REGEX REPLACE ".h\"" "" kernel_name ${kernel_name}) + list(APPEND all_kernel_name ${kernel_name}) else() - string(REGEX REPLACE "#include \"paddle\/phi\/kernels\/${kernel_library_SUB_DIR}\/" "" kernel_name ${include_kernel}) + # NOTE(dev): we should firstly match kernel_library_SUB_DIR. + if (${include_kernel} MATCHES "#include \"paddle\/phi\/kernels\/${kernel_library_SUB_DIR}\/") + string(REGEX REPLACE "#include \"paddle\/phi\/kernels\/${kernel_library_SUB_DIR}\/" "" kernel_name ${include_kernel}) + # for selected_rows directory, add ${target_suffix}. 
+ string(REGEX REPLACE ".h\"" "${target_suffix}" kernel_name ${kernel_name}) + list(APPEND all_kernel_name ${kernel_name}) + else() + string(REGEX REPLACE "#include \"paddle\/phi\/kernels\/" "" kernel_name ${include_kernel}) + string(REGEX REPLACE ".h\"" "" kernel_name ${kernel_name}) + list(APPEND all_kernel_name ${kernel_name}) + endif() + message(STATUS "${TARGET} DEPS ${all_kernel_name}") endif() - string(REGEX REPLACE ".h\"" "" kernel_name ${kernel_name}) - list(APPEND kernel_deps ${kernel_name}) + list(APPEND kernel_deps ${all_kernel_name}) endforeach() endforeach() list(REMOVE_DUPLICATES kernel_deps) - list(REMOVE_ITEM kernel_deps ${TARGET}) + list(REMOVE_ITEM kernel_deps ${TARGET}${target_suffix}) list(LENGTH common_srcs common_srcs_len) list(LENGTH cpu_srcs cpu_srcs_len) @@ -196,92 +218,73 @@ function(kernel_library TARGET) list(LENGTH xpu_srcs xpu_srcs_len) list(LENGTH gpudnn_srcs gpudnn_srcs_len) list(LENGTH kps_srcs kps_srcs_len) - list(LENGTH selected_rows_srcs selected_rows_srcs_len) # kernel source file level # level 1: base device kernel # - cpu_srcs / gpu_srcs / xpu_srcs / gpudnn_srcs / kps_srcs # level 2: device-independent kernel # - common_srcs - # level 3: Kernel implemented by reusing device-independent kernel - # - selected_rows_srcs set(base_device_kernels) set(device_independent_kernel) - set(high_level_kernels) # 1. Base device kernel compile if (${cpu_srcs_len} GREATER 0) - cc_library(${TARGET}_cpu SRCS ${cpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - list(APPEND base_device_kernels ${TARGET}_cpu) + cc_library(${TARGET}_cpu${target_suffix} SRCS ${cpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + list(APPEND base_device_kernels ${TARGET}_cpu${target_suffix}) endif() if (${gpu_srcs_len} GREATER 0) if (WITH_GPU) - nv_library(${TARGET}_gpu SRCS ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + nv_library(${TARGET}_gpu${target_suffix} SRCS ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) elseif (WITH_ROCM) - hip_library(${TARGET}_gpu SRCS ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + hip_library(${TARGET}_gpu${target_suffix} SRCS ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) endif() - list(APPEND base_device_kernels ${TARGET}_gpu) + list(APPEND base_device_kernels ${TARGET}_gpu${target_suffix}) endif() if (${xpu_srcs_len} GREATER 0) - cc_library(${TARGET}_xpu SRCS ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - list(APPEND base_device_kernels ${TARGET}_xpu) + cc_library(${TARGET}_xpu${target_suffix} SRCS ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + list(APPEND base_device_kernels ${TARGET}_xpu${target_suffix}) endif() if (${gpudnn_srcs_len} GREATER 0) if (WITH_GPU) - nv_library(${TARGET}_gpudnn SRCS ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + nv_library(${TARGET}_gpudnn${target_suffix} SRCS ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) elseif (WITH_ROCM) - hip_library(${TARGET}_gpudnn SRCS ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + hip_library(${TARGET}_gpudnn${target_suffix} SRCS ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) endif() - list(APPEND base_device_kernels ${TARGET}_gpudnn) + list(APPEND base_device_kernels ${TARGET}_gpudnn${target_suffix}) endif() if (${kps_srcs_len} GREATER 0) # only when WITH_XPU_KP, the kps_srcs_len can be > 0 - xpu_library(${TARGET}_kps SRCS ${kps_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - list(APPEND base_device_kernels ${TARGET}_kps) + xpu_library(${TARGET}_kps${target_suffix} SRCS 
${kps_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + list(APPEND base_device_kernels ${TARGET}_kps${target_suffix}) endif() # 2. Device-independent kernel compile if (${common_srcs_len} GREATER 0) if (WITH_GPU) - nv_library(${TARGET}_common SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels}) + nv_library(${TARGET}_common${target_suffix} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels}) elseif (WITH_ROCM) - hip_library(${TARGET}_common SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels}) + hip_library(${TARGET}_common${target_suffix} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels}) elseif (WITH_XPU_KP) - xpu_library(${TARGET}_common SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels}) + xpu_library(${TARGET}_common${target_suffix} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels}) else() - cc_library(${TARGET}_common SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels}) + cc_library(${TARGET}_common${target_suffix} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels}) endif() - list(APPEND device_independent_kernel ${TARGET}_common) + list(APPEND device_independent_kernel ${TARGET}_common${target_suffix}) endif() - # 3. Reusing kernel compile - if (${selected_rows_srcs_len} GREATER 0) - if (WITH_GPU) - nv_library(${TARGET}_sr SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel}) - elseif (WITH_ROCM) - hip_library(${TARGET}_sr SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel}) - elseif (WITH_XPU_KP) - xpu_library(${TARGET}_sr SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel}) - else() - cc_library(${TARGET}_sr SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel}) - endif() - list(APPEND high_level_kernels ${TARGET}_sr) - endif() - # 4. Unify target compile + # 3. 
Unify target compile list(LENGTH base_device_kernels base_device_kernels_len) list(LENGTH device_independent_kernel device_independent_kernel_len) - list(LENGTH high_level_kernels high_level_kernels_len) - if (${base_device_kernels_len} GREATER 0 OR ${device_independent_kernel_len} GREATER 0 OR - ${high_level_kernels_len} GREATER 0) + if (${base_device_kernels_len} GREATER 0 OR ${device_independent_kernel_len} GREATER 0) if (WITH_GPU) - nv_library(${TARGET} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel} ${high_level_kernels}) + nv_library(${TARGET}${target_suffix} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel}) elseif (WITH_ROCM) - hip_library(${TARGET} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel} ${high_level_kernels}) + hip_library(${TARGET}${target_suffix} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel}) elseif (WITH_XPU_KP) - xpu_library(${TARGET} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel} ${high_level_kernels}) + xpu_library(${TARGET}${target_suffix} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel}) else() - cc_library(${TARGET} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel} ${high_level_kernels}) + cc_library(${TARGET}${target_suffix} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel}) endif() else() set(target_build_flag 0) @@ -290,10 +293,10 @@ function(kernel_library TARGET) if (${target_build_flag} EQUAL 1) if (${common_srcs_len} GREATER 0 OR ${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0 OR - ${gpudnn_srcs_len} GREATER 0 OR ${selected_rows_srcs_len} GREATER 0) + ${gpudnn_srcs_len} GREATER 0) # append target into PHI_KERNELS property get_property(phi_kernels GLOBAL PROPERTY PHI_KERNELS) - set(phi_kernels ${phi_kernels} ${TARGET}) + set(phi_kernels ${phi_kernels} ${TARGET}${target_suffix}) set_property(GLOBAL PROPERTY PHI_KERNELS ${phi_kernels}) endif() @@ -318,9 +321,6 @@ function(kernel_library TARGET) if (${kps_srcs_len} GREATER 0) kernel_declare(${kps_srcs}) endif() - if (${selected_rows_srcs_len} GREATER 0) - kernel_declare(${selected_rows_srcs}) - endif() endif() endfunction() diff --git a/paddle/fluid/distributed/ps.proto b/paddle/fluid/distributed/ps.proto index fac30e26c388c65af13135699a886a3c69031d57..9bfa2c05efa67df894ad4e4fa00b095391ee81ca 100644 --- a/paddle/fluid/distributed/ps.proto +++ b/paddle/fluid/distributed/ps.proto @@ -219,13 +219,13 @@ message GraphParameter { optional string gpups_graph_sample_class = 3 [ default = "CompleteGraphSampler" ]; optional string gpups_graph_sample_args = 4 [ default = "" ]; - optional bool use_cache = 5 [ default = true ]; - optional float cache_ratio = 6 [ default = 0.3 ]; + optional bool use_cache = 5 [ default = false ]; + optional int32 cache_size_limit = 6 [ default = 100000 ]; optional int32 cache_ttl = 7 [ default = 5 ]; optional GraphFeature graph_feature = 8; optional string table_name = 9 [ default = "" ]; optional string table_type = 10 [ default = "" ]; - optional int32 gpups_mode_shard_num = 11 [ default = 127 ]; + optional int32 shard_num = 11 [ default = 127 ]; optional int32 gpu_num = 12 [ default = 1 ]; } diff --git a/paddle/fluid/distributed/ps/table/common_graph_table.cc 
b/paddle/fluid/distributed/ps/table/common_graph_table.cc index 2c07bd65d63d408b1bff12eda7bcf8fba3336db6..b326870a3a7b0e7d28417b8d18d002e195fa9d54 100644 --- a/paddle/fluid/distributed/ps/table/common_graph_table.cc +++ b/paddle/fluid/distributed/ps/table/common_graph_table.cc @@ -138,7 +138,6 @@ int BasicBfsGraphSampler::run_graph_sampling() { int init_size = 0; //__sync_fetch_and_add std::function bfs = [&, this](int i, int id) -> int { - VLOG(0) << "in bfs " << i << " " << id; if (this->status == GraphSamplerStatus::terminating) { int task_left = __sync_sub_and_fetch(&task_size, 1); if (task_left == 0) { @@ -148,13 +147,13 @@ int BasicBfsGraphSampler::run_graph_sampling() { } size_t ind = i % this->graph_table->task_pool_size_; if (nodes_left[i] > 0) { - nodes_left[i]--; auto iter = sample_neighbors_map[ind].find(id); if (iter == sample_neighbors_map[ind].end()) { - sample_neighbors_map[ind][id] = std::vector(); - iter = sample_neighbors_map[ind].find(id); Node *node = graph_table->shards[i]->find_node(id); if (node != NULL) { + nodes_left[i]--; + sample_neighbors_map[ind][id] = std::vector(); + iter = sample_neighbors_map[ind].find(id); size_t edge_fetch_size = std::min((size_t) this->edge_num_for_each_node, node->get_neighbor_size()); @@ -179,11 +178,14 @@ int BasicBfsGraphSampler::run_graph_sampling() { for (size_t i = 0; i < graph_table->shards.size(); ++i) { std::vector &v = graph_table->shards[i]->get_bucket(); if (v.size() > 0) { - init_size++; - __sync_add_and_fetch(&task_size, 1); - int64_t id = v[0]->get_id(); - graph_table->_shards_task_pool[i % graph_table->task_pool_size_] - ->enqueue(bfs, i, id); + int search_size = std::min(init_search_size, (int)v.size()); + for (int k = 0; k < search_size; k++) { + init_size++; + __sync_add_and_fetch(&task_size, 1); + int64_t id = v[k]->get_id(); + graph_table->_shards_task_pool[i % graph_table->task_pool_size_] + ->enqueue(bfs, i, id); + } } // if } if (init_size == 0) { @@ -301,10 +303,11 @@ void BasicBfsGraphSampler::init(size_t gpu_num, GraphTable *graph_table, std::vector args) { this->gpu_num = gpu_num; this->graph_table = graph_table; - node_num_for_each_shard = args.size() > 0 ? std::stoi(args[0]) : 10; - edge_num_for_each_node = args.size() > 1 ? std::stoi(args[1]) : 10; - rounds = args.size() > 2 ? std::stoi(args[2]) : 1; - interval = args.size() > 3 ? std::stoi(args[3]) : 60; + init_search_size = args.size() > 0 ? std::stoi(args[0]) : 10; + node_num_for_each_shard = args.size() > 1 ? std::stoi(args[1]) : 10; + edge_num_for_each_node = args.size() > 2 ? std::stoi(args[2]) : 10; + rounds = args.size() > 3 ? std::stoi(args[3]) : 1; + interval = args.size() > 4 ? 
std::stoi(args[4]) : 60; } #endif @@ -1092,11 +1095,6 @@ int32_t GraphTable::initialize(const GraphParameter &graph) { #ifdef PADDLE_WITH_HETERPS if (graph.gpups_mode()) { gpups_mode = true; - if (shard_num == 0) { - shard_num = graph.gpups_mode_shard_num(); - server_num = 1; - _shard_idx = 0; - } auto *sampler = CREATE_PSCORE_CLASS(GraphSampler, graph.gpups_graph_sample_class()); auto slices = @@ -1107,7 +1105,18 @@ int32_t GraphTable::initialize(const GraphParameter &graph) { graph_sampler.reset(sampler); } #endif + if (shard_num == 0) { + server_num = 1; + _shard_idx = 0; + shard_num = graph.shard_num(); + } task_pool_size_ = graph.task_pool_size(); + use_cache = graph.use_cache(); + if (use_cache) { + cache_size_limit = graph.cache_size_limit(); + cache_ttl = graph.cache_ttl(); + make_neighbor_sample_cache((size_t)cache_size_limit, (size_t)cache_ttl); + } _shards_task_pool.resize(task_pool_size_); for (size_t i = 0; i < _shards_task_pool.size(); ++i) { _shards_task_pool[i].reset(new ::ThreadPool(1)); diff --git a/paddle/fluid/distributed/ps/table/common_graph_table.h b/paddle/fluid/distributed/ps/table/common_graph_table.h index f6f127621b947c41122f7803a90f39b640713b8e..4c97cea23eaa277a81538e9c8aeacd6478bc9c51 100644 --- a/paddle/fluid/distributed/ps/table/common_graph_table.h +++ b/paddle/fluid/distributed/ps/table/common_graph_table.h @@ -547,6 +547,8 @@ class GraphTable : public SparseTable { std::unordered_set extra_nodes; std::unordered_map extra_nodes_to_thread_index; bool use_cache, use_duplicate_nodes; + int cache_size_limit; + int cache_ttl; mutable std::mutex mutex_; std::shared_ptr rw_lock; #ifdef PADDLE_WITH_HETERPS @@ -593,7 +595,7 @@ class BasicBfsGraphSampler : public GraphSampler { std::vector> sample_nodes; std::vector> sample_neighbors; size_t gpu_num; - int node_num_for_each_shard, edge_num_for_each_node; + int init_search_size, node_num_for_each_shard, edge_num_for_each_node; int rounds, interval; std::vector>> sample_neighbors_map; diff --git a/paddle/fluid/distributed/test/graph_node_test.cc b/paddle/fluid/distributed/test/graph_node_test.cc index 565d51379d5a8519de241deea192ffbdbfa49fd0..a3f3c48581d6195569747cdf03ac389979caf7df 100644 --- a/paddle/fluid/distributed/test/graph_node_test.cc +++ b/paddle/fluid/distributed/test/graph_node_test.cc @@ -456,7 +456,7 @@ void RunBrpcPushSparse() { pull_status.wait(); ASSERT_EQ(_vs[0].size(), vs1[0].size()); - for (int j = 0; j < _vs[0].size(); j++) { + for (size_t j = 0; j < _vs[0].size(); j++) { ASSERT_EQ(_vs[0][j], vs1[0][j]); } } diff --git a/paddle/fluid/distributed/test/graph_table_sample_test.cc b/paddle/fluid/distributed/test/graph_table_sample_test.cc index 65455028247ddf7d310040ecae0018b619f75bf1..2866bd0bda0253f99fe5cf7f82086236671388e1 100644 --- a/paddle/fluid/distributed/test/graph_table_sample_test.cc +++ b/paddle/fluid/distributed/test/graph_table_sample_test.cc @@ -86,7 +86,7 @@ void testGraphSample() { #ifdef PADDLE_WITH_HETERPS ::paddle::distributed::GraphParameter table_proto; table_proto.set_gpups_mode(true); - table_proto.set_gpups_mode_shard_num(127); + table_proto.set_shard_num(127); table_proto.set_gpu_num(2); distributed::GraphTable graph_table, graph_table1; @@ -113,7 +113,7 @@ void testGraphSample() { ::paddle::distributed::GraphParameter table_proto1; table_proto1.set_gpups_mode(true); - table_proto1.set_gpups_mode_shard_num(127); + table_proto1.set_shard_num(127); table_proto1.set_gpu_num(2); table_proto1.set_gpups_graph_sample_class("BasicBfsGraphSampler"); 
table_proto1.set_gpups_graph_sample_args("5,5,1,1"); diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..6e1bee37a4e59232a78a8187cd986797c5f2ee42 --- /dev/null +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py @@ -0,0 +1,415 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import yaml +import re +import argparse +import os + +######################## +### Global Variables ### +######################## +ops_to_fill_zero_for_empty_grads = set(list("split")) + +# For API dispatch used at python-level +# { op_name : [arg_name, ...] } +core_ops_returns_info = {} +core_ops_args_info = {} +core_ops_args_type_info = {} + +yaml_types_mapping = { + 'int' : 'int', 'int32' : 'int32_t', 'int64' : 'int64_t', 'size_t' : 'size_t', \ + 'float' : 'float', 'double' : 'double', 'bool' : 'bool', \ + 'str' : 'std::string', \ + 'Place' : 'paddle::experimental::Place', 'DataLayout' : 'paddle::experimental::DataLayout', 'DataType' : 'paddle::experimental::DataType', \ + 'int64[]' : 'std::vector', 'int[]' : 'std::vector', + 'Tensor' : 'Tensor', + 'Tensor[]' : 'std::vector', + 'Tensor[Tensor[]]' : 'std::vector>', + 'Scalar' : 'paddle::experimental::Scalar', + 'ScalarArray' : 'paddle::experimental::ScalarArray' +} + + +############################# +### File Reader Helpers ### +############################# +def ReadFwdFile(filepath): + f = open(filepath, 'r') + contents = yaml.load(f, Loader=yaml.FullLoader) + f.close() + return contents + + +def ReadBwdFile(filepath): + f = open(filepath, 'r') + contents = yaml.load(f, Loader=yaml.FullLoader) + ret = {} + for content in contents: + if 'backward_api' in content.keys(): + api_name = content['backward_api'] + else: + assert False + + ret[api_name] = content + f.close() + return ret + + +################################## +### Generic Helper Functions ### +################################## +def FindGradName(string): + return string + "_grad" + + +def FindForwardName(string): + if not string.endswith("_grad"): + return None + return string[:-5] + + +def IsPlainTensorType(string): + plain_tensor_types = ['Tensor&', 'Tensor', 'const Tensor&', 'const Tensor'] + if string in plain_tensor_types: + return True + return False + + +def IsVectorTensorType(string): + vector_tensor_types = [ + 'std::vector>', 'std::vector' + ] + if string in vector_tensor_types: + return True + return False + + +def GetSavedName(string): + return string + "_" + + +def GetConstReference(string): + ret = string + if not string.startswith("const "): + ret = "const " + string + if not string.endswith("&"): + ret += "&" + return ret + + +def RemoveConstAndReference(string): + ret = string + if string.startswith("const "): + ret = ret[6:] + if string.endswith("&"): + ret = ret[:-1] + + return ret + + 
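The string helpers introduced above in codegen_utils.py are small pure functions, so their behavior can be checked in isolation. Below is a minimal usage sketch, not part of the patch itself: it assumes codegen_utils.py is importable from the working directory, and "matmul" is used purely as an illustrative op name.

# Illustrative check of the naming/type helpers defined above in codegen_utils.py.
# Assumes that file is on sys.path; "matmul" is only a sample op name.
from codegen_utils import (FindGradName, FindForwardName, GetSavedName,
                           IsPlainTensorType, GetConstReference,
                           RemoveConstAndReference)

assert FindGradName("matmul") == "matmul_grad"         # forward op name -> backward op name
assert FindForwardName("matmul_grad") == "matmul"      # strips the trailing "_grad"
assert FindForwardName("matmul") is None               # non-grad names have no forward counterpart
assert GetSavedName("x") == "x_"                       # member name used for saved tensor wrappers/attrs
assert IsPlainTensorType("const Tensor&")              # recognizes single-Tensor argument types
assert GetConstReference("Tensor") == "const Tensor&"
assert RemoveConstAndReference("const Tensor&") == "Tensor"
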
+def GetGradNodeName(string): + return f"FinalGradNode{string}" + + +def GetDygraphForwardFunctionName(string): + return f"{string}_final_state_dygraph_function" + + +def GetIntermediateAPIFunctionName(string): + return string + "_intermediate" + + +def GetAutoGradMetaName(string): + return f"{string}_autograd_meta" + + +def GetAutoGradMetaVectorName(string): + return f"{string}_autograd_meta_vec" + + +def RemoveSpecialSymbolsInName(string): + # Remove any name after '@' + ret = string.split("@")[0] + return ret + + +def RecoverBaseNameOfInplaceFunction(function_name): + return function_name[:-1] + + +def GetInplacedFunctionName(function_name): + return function_name + "_" + + +def GetForwardFunctionName(string): + return f"{string}_final_state_dygraph_function" + + +###################### +### Yaml Parsers ### +###################### +def ParseYamlArgs(string): + # Example: const Tensor& x, const Tensor& y, bool transpose_x, bool transpose_y + + # inputs_list = [ [arg_name, arg_type, orig_position], ...] + inputs_list = [] + # attrs_list = [ [arg_name, arg_type, default_value, orig_position], ...] + attrs_list = [] + + args = [x.strip() for x in string.strip().split(",")] + atype = r'((const )?\S+) ' + aname = r'(.*)' + pattern = f'{atype}{aname}' + for i in range(len(args)): + arg = args[i] + m = re.search(pattern, arg) + arg_type = m.group(1).strip() + arg_name = m.group(3).split("=")[0].strip() + default_value = m.group(3).split("=")[1].strip() if len( + m.group(3).split("=")) > 1 else None + + assert arg_type in yaml_types_mapping.keys( + ), f"The argument type {arg_type} in yaml config is not supported in yaml_types_mapping." + arg_type = yaml_types_mapping[arg_type] + + arg_name = RemoveSpecialSymbolsInName(arg_name) + if "Tensor" in arg_type: + assert default_value is None + inputs_list.append([arg_name, arg_type, i]) + else: + attrs_list.append([arg_name, arg_type, default_value, i]) + + return inputs_list, attrs_list + + +def ParseYamlReturns(string): + # Example0: Tensor(out), Tensor(out1) + # Example1: Tensor, Tensor + # Example2: Tensor[](out), Tensor + + # list = [ [ret_name, ret_type, orig_position], ...] + returns_list = [] + + returns = [x.strip() for x in string.strip().split(",")] + + for i in range(len(returns)): + ret = returns[i] + + ret_name = "" + if "(" in ret and ")" in ret: + # Remove trailing ')' + ret = ret[:-1] + ret_type = ret.split("(")[0].strip() + ret_name = ret.split("(")[1].strip() + else: + ret_type = ret.strip() + + assert ret_type in yaml_types_mapping.keys( + ), f"The return type {ret_type} in yaml config is not supported in yaml_types_mapping." 
+ ret_type = yaml_types_mapping[ret_type] + + assert "Tensor" in ret_type + ret_name = RemoveSpecialSymbolsInName(ret_name) + returns_list.append([ret_name, ret_type, i]) + + return returns_list + + +def ParseYamlForwardFromBackward(string): + # Example: matmul (const Tensor& x, const Tensor& y, bool transpose_x, bool transpose_y) -> Tensor(out) + + fname = r'(.*?)' + wspace = r'\s*' + fargs = r'(.*?)' + frets = r'(.*)' + pattern = f'{fname}{wspace}\({wspace}{fargs}{wspace}\){wspace}->{wspace}{frets}' + + m = re.search(pattern, string) + function_name = m.group(1) + function_args = m.group(2) + function_returns = m.group(3) + + forward_inputs_list, forward_attrs_list = ParseYamlArgs(function_args) + forward_returns_list = ParseYamlReturns(function_returns) + + return forward_inputs_list, forward_attrs_list, forward_returns_list + + +def ParseYamlForward(args_str, returns_str): + # args Example: (const Tensor& x, const Tensor& y, bool transpose_x = false, bool transpose_y = false) + # returns Example: Tensor, Tensor + + fargs = r'(.*?)' + wspace = r'\s*' + args_pattern = f'\({fargs}\)' + args_str = re.search(args_pattern, args_str).group(1) + + inputs_list, attrs_list = ParseYamlArgs(args_str) + returns_list = ParseYamlReturns(returns_str) + + return inputs_list, attrs_list, returns_list + + +def ParseYamlBackward(args_str, returns_str): + # args Example: (const Tensor& x, const Tensor& y, const Tensor& out_grad, bool transpose_x=false, bool transpose_y=false) + # returns Example: Tensor(x_grad), Tensor(y_grad) + + fargs = r'(.*?)' + wspace = r'\s*' + args_pattern = f'\({fargs}\)' + args_str = re.search(args_pattern, args_str).group(1) + + inputs_list, attrs_list = ParseYamlArgs(args_str) + returns_list = ParseYamlReturns(returns_str) + + return inputs_list, attrs_list, returns_list + + +######################## +### Generator Base ### +######################## +class FunctionGeneratorBase: + def __init__(self, forward_api_contents, namespace): + self.forward_api_contents = forward_api_contents + self.namespace = namespace + + self.forward_api_name = "" + + self.orig_forward_inputs_list = [ + ] #[ [arg_name, arg_type, orig_position], ...] + self.orig_forward_attrs_list = [ + ] #[ [attr_name, attr_type, default_value, orig_position], ...] + self.orig_forward_returns_list = [ + ] #[ [ret_name, ret_type, orig_position], ...] + + # Processed Forward Data + self.forward_inputs_position_map = { + } #{ "name" : [type, fwd_position] } + self.forward_outputs_position_map = { + } #{ "name" : [type, fwd_position] } + + # Special Op Attributes + self.optional_inputs = [] #[name, ...] + self.no_need_buffers = [] #[name, ...] + self.intermediate_outputs = [] #[name, ...] 
+ self.inplace_map = {} #{name : name, ...} + + def ParseInplaceInfo(self): + forward_api_contents = self.forward_api_contents + if 'inplace' not in forward_api_contents.keys(): return + + # inplace_map_str: "(x -> out0), (y -> out2)" + inplace_map_str = forward_api_contents['inplace'] + for pair in inplace_map_str.split(","): + pair = pair.strip() + if pair.startswith("("): + pair = pair[1:] + + if pair.endswith(")"): + pair = pair[:-1] + + key = pair.split("->")[0].strip() + val = pair.split("->")[1].strip() + self.inplace_map[key] = val + + def ParseNoNeedBuffer(self): + forward_api_contents = self.forward_api_contents + + if 'no_need_buffer' in forward_api_contents.keys(): + no_need_buffer_str = forward_api_contents['no_need_buffer'] + for name in no_need_buffer_str.split(","): + name = name.strip() + name = RemoveSpecialSymbolsInName(name) + self.no_need_buffers.append(name.strip()) + + def ParseDispensable(self): + forward_api_contents = self.forward_api_contents + + if 'optional' in forward_api_contents.keys(): + optional_inputs_str = forward_api_contents['optional'] + for name in optional_inputs_str.split(","): + name = name.strip() + name = RemoveSpecialSymbolsInName(name) + self.optional_inputs.append(name) + + def ParseIntermediate(self): + forward_api_contents = self.forward_api_contents + + if 'intermediate' in forward_api_contents.keys(): + intermediate_str = forward_api_contents['intermediate'] + for name in intermediate_str.split(","): + name = name.strip() + name = RemoveSpecialSymbolsInName(name) + self.intermediate_outputs.append(name) + + def CollectOriginalForwardInfo(self): + forward_api_contents = self.forward_api_contents + + self.forward_api_name = forward_api_contents['api'] + forward_args_str = forward_api_contents['args'] + forward_returns_str = forward_api_contents['output'] + + assert 'api' in forward_api_contents.keys( + ), "Unable to find \"api\" in forward_api_contents keys" + assert 'args' in forward_api_contents.keys( + ), "Unable to find \"args\" in forward_api_contents keys" + assert 'output' in forward_api_contents.keys( + ), "Unable to find \"output\" in forward_api_contents keys" + + # Collect Original Forward Inputs/Outputs and then perform validation checks + self.orig_forward_inputs_list, self.orig_forward_attrs_list, self.orig_forward_returns_list = ParseYamlForward( + forward_args_str, forward_returns_str) + + def DetermineForwardPositionMap(self, forward_inputs_list, + forward_returns_list): + for i in range(len(forward_inputs_list)): + forward_input = forward_inputs_list[i] + input_name = forward_input[0] + input_type = forward_input[1] + input_pos = forward_input[2] + + self.forward_inputs_position_map[ + input_name] = [input_type, input_pos] + + for i in range(len(forward_returns_list)): + forward_return = forward_returns_list[i] + return_name = forward_return[0] + return_type = forward_return[1] + return_pos = forward_return[2] + + self.forward_outputs_position_map[ + return_name] = [return_type, return_pos] + print("Generated Forward Input Position Map: ", + self.forward_inputs_position_map) + print("Generated Forward Output Position Map: ", + self.forward_outputs_position_map) + + +class YamlGeneratorBase: + def __init__(self, api_yaml_path): + self.namespace = "" + self.api_yaml_path = api_yaml_path + + self.forward_api_list = [] + + def ParseForwardYamlContents(self): + api_yaml_path = self.api_yaml_path + self.forward_api_list = ReadFwdFile(api_yaml_path) + + def InferNameSpace(self): + api_yaml_path = self.api_yaml_path + if "sparse" 
in api_yaml_path: + self.namespace = "sparse::" diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index 1d18cbe782948b5d4eda6b65e49dc839f799b76e..fd750c0d07369e104291a981514137f02115c026 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -16,31 +16,25 @@ import yaml import re import argparse import os - -ops_to_fill_zero_for_empty_grads = set(list("split")) - -# For API dispatch used at python-level -# { op_name : [arg_name, ...] } -core_ops_returns_info = {} -core_ops_args_info = {} -core_ops_args_type_info = {} - -namespace = "" - -yaml_types_mapping = { - 'int' : 'int', 'int32' : 'int32_t', 'int64' : 'int64_t', 'size_t' : 'size_t', \ - 'float' : 'float', 'double' : 'double', 'bool' : 'bool', \ - 'str' : 'std::string', \ - 'Place' : 'paddle::experimental::Place', 'DataLayout' : 'paddle::experimental::DataLayout', 'DataType' : 'paddle::experimental::DataType', \ - 'int64[]' : 'std::vector', 'int[]' : 'std::vector', - 'Tensor' : 'Tensor', - 'Tensor[]' : 'std::vector', - 'Tensor[Tensor[]]' : 'std::vector>', - 'Scalar' : 'paddle::experimental::Scalar', - 'ScalarArray' : 'paddle::experimental::ScalarArray' -} - - +from codegen_utils import core_ops_returns_info, core_ops_args_info, core_ops_args_type_info +from codegen_utils import yaml_types_mapping +from codegen_utils import ReadFwdFile, ReadBwdFile +from codegen_utils import FindGradName, FindForwardName, GetSavedName, GetGradNodeName +from codegen_utils import IsPlainTensorType, IsVectorTensorType +from codegen_utils import GetConstReference, RemoveConstAndReference +from codegen_utils import GetDygraphForwardFunctionName, GetIntermediateAPIFunctionName +from codegen_utils import GetAutoGradMetaName, GetAutoGradMetaVectorName +from codegen_utils import RemoveSpecialSymbolsInName, RecoverBaseNameOfInplaceFunction +from codegen_utils import GetInplacedFunctionName +from codegen_utils import ParseYamlArgs, ParseYamlReturns, ParseYamlForwardFromBackward +from codegen_utils import ParseYamlForward, ParseYamlBackward +from codegen_utils import FunctionGeneratorBase, YamlGeneratorBase +from codegen_utils import ops_to_fill_zero_for_empty_grads + + +########### +## Utils ## +########### def ParseArguments(): parser = argparse.ArgumentParser( description='Eager Code Generator Args Parser') @@ -55,845 +49,129 @@ def ParseArguments(): return args -################# -### Helpers ### -################# -def RecoverBaseNameOfInplaceFunction(function_name): - return function_name[:-1] - - -def GetInplacedFunctionName(function_name): - return function_name + "_" - - -def FindGradName(string): - return string + "_grad" - - -def FindForwardName(string): - if not string.endswith("_grad"): - return None - return string[:-5] - - -def IsPlainTensorType(string): - plain_tensor_types = ['Tensor&', 'Tensor', 'const Tensor&', 'const Tensor'] - if string in plain_tensor_types: - return True - return False - - -def IsVectorTensorType(string): - vector_tensor_types = [ - 'std::vector>', 'std::vector' - ] - if string in vector_tensor_types: - return True - return False - - -def GetSavedName(string): - return string + "_" - - -def GetConstReference(string): - ret = string - if not string.startswith("const "): - ret = "const " + string - if not string.endswith("&"): - ret += "&" - return ret - - -def RemoveConstAndReference(string): - ret = 
string - if string.startswith("const "): - ret = ret[6:] - if string.endswith("&"): - ret = ret[:-1] - - return ret - - -def GetGradNodeName(string): - return f"FinalGradNode{string}" - - -def GetForwardFunctionName(string): - return f"{string}_final_state_dygraph_function" - - -def GetAutoGradMetaName(string): - return f"{string}_autograd_meta" - - -def GetAutoGradMetaVectorName(string): - return f"{string}_autograd_meta_vec" - - -###################### -### File Readers ### -###################### -def ReadFwdFile(filepath): - f = open(filepath, 'r') - contents = yaml.load(f, Loader=yaml.FullLoader) - f.close() - return contents - - -def ReadBwdFile(filepath): - f = open(filepath, 'r') - contents = yaml.load(f, Loader=yaml.FullLoader) - ret = {} - for content in contents: - if 'backward_api' in content.keys(): - api_name = content['backward_api'] - else: - assert False - - ret[api_name] = content - f.close() - return ret - - -###################### -### Yaml Parsers ### -###################### -def ParseInplaceInfo(string): - # string: "(x -> out0), (y -> out2)" - inplace_map = {} - for pair in string.split(","): - pair = pair.strip() - if pair.startswith("("): - pair = pair[1:] - - if pair.endswith(")"): - pair = pair[:-1] - - key = pair.split("->")[0].strip() - val = pair.split("->")[1].strip() - inplace_map[key] = val - - return inplace_map - - -def RemoveSpecialSymbolsInName(string): - # Remove any name after '@' - ret = string.split("@")[0] - return ret - - -def IntermediateValidationCheck(intermediate_outputs, forward_returns_list): - # intermediate_outputs : [name0, name1, ...] - # forward_returns_list : [[ret_name, type, orig_pos], ...] - """ - Check whether intermediate_outputs are positioned - at the very end of forward_returns_list - """ - - intermediate_positions = range( - len(forward_returns_list) - len(intermediate_outputs), - len(forward_returns_list)) - for ret_name, _, pos in forward_returns_list: - if ret_name in intermediate_outputs: - assert pos in intermediate_positions - - -def ParseDispensable(string): - # string: "X, Y" - string = RemoveSpecialSymbolsInName(string) - return [v.strip() for v in string.split(",")] - - -def ParseIntermediate(string): - string = RemoveSpecialSymbolsInName(string) - return [v.strip() for v in string.split(",")] - - -def ParseNoNeedBuffer(string): - # string: "x, y" - string = RemoveSpecialSymbolsInName(string) - - no_need_buffer_set = set() - for name in string.split(","): - no_need_buffer_set.add(name.strip()) - - return no_need_buffer_set - - -def ParseYamlArgs(string): - # Example: const Tensor& x, const Tensor& y, bool transpose_x, bool transpose_y - - # inputs_list = [ [arg_name, arg_type, orig_position], ...] - inputs_list = [] - # attrs_list = [ [arg_name, arg_type, default_value, orig_position], ...] - attrs_list = [] - - args = [x.strip() for x in string.strip().split(",")] - atype = r'((const )?\S+) ' - aname = r'(.*)' - pattern = f'{atype}{aname}' - for i in range(len(args)): - arg = args[i] - m = re.search(pattern, arg) - arg_type = m.group(1).strip() - arg_name = m.group(3).split("=")[0].strip() - default_value = m.group(3).split("=")[1].strip() if len( - m.group(3).split("=")) > 1 else None - - assert arg_type in yaml_types_mapping.keys( - ), f"The argument type {arg_type} in yaml config is not supported in yaml_types_mapping." 
- arg_type = yaml_types_mapping[arg_type] - - arg_name = RemoveSpecialSymbolsInName(arg_name) - if "Tensor" in arg_type: - assert default_value is None - inputs_list.append([arg_name, arg_type, i]) - else: - attrs_list.append([arg_name, arg_type, default_value, i]) - - return inputs_list, attrs_list - - -def ParseYamlReturns(string): - # Example0: Tensor(out), Tensor(out1) - # Example1: Tensor, Tensor - # Example2: Tensor[](out), Tensor - - # list = [ [ret_name, ret_type, orig_position], ...] - returns_list = [] - - returns = [x.strip() for x in string.strip().split(",")] - - for i in range(len(returns)): - ret = returns[i] - - ret_name = "" - if "(" in ret and ")" in ret: - # Remove trailing ')' - ret = ret[:-1] - ret_type = ret.split("(")[0].strip() - ret_name = ret.split("(")[1].strip() - else: - ret_type = ret.strip() - - assert ret_type in yaml_types_mapping.keys( - ), f"The return type {ret_type} in yaml config is not supported in yaml_types_mapping." - ret_type = yaml_types_mapping[ret_type] - - assert "Tensor" in ret_type - ret_name = RemoveSpecialSymbolsInName(ret_name) - returns_list.append([ret_name, ret_type, i]) - - return returns_list - - -def ParseYamlForwardFromBackward(string): - # Example: matmul (const Tensor& x, const Tensor& y, bool transpose_x, bool transpose_y) -> Tensor(out) - - fname = r'(.*?)' - wspace = r'\s*' - fargs = r'(.*?)' - frets = r'(.*)' - pattern = f'{fname}{wspace}\({wspace}{fargs}{wspace}\){wspace}->{wspace}{frets}' - - m = re.search(pattern, string) - function_name = m.group(1) - function_args = m.group(2) - function_returns = m.group(3) - - forward_inputs_list, forward_attrs_list = ParseYamlArgs(function_args) - forward_returns_list = ParseYamlReturns(function_returns) - - return forward_inputs_list, forward_attrs_list, forward_returns_list - - -def ParseYamlForward(args_str, returns_str): - # args Example: (const Tensor& x, const Tensor& y, bool transpose_x = false, bool transpose_y = false) - # returns Example: Tensor, Tensor - - fargs = r'(.*?)' - wspace = r'\s*' - args_pattern = f'\({fargs}\)' - args_str = re.search(args_pattern, args_str).group(1) - - inputs_list, attrs_list = ParseYamlArgs(args_str) - returns_list = ParseYamlReturns(returns_str) - - return inputs_list, attrs_list, returns_list - - -def ParseYamlBackward(args_str, returns_str): - # args Example: (const Tensor& x, const Tensor& y, const Tensor& out_grad, bool transpose_x=false, bool transpose_y=false) - # returns Example: Tensor(x_grad), Tensor(y_grad) - - fargs = r'(.*?)' - wspace = r'\s*' - args_pattern = f'\({fargs}\)' - args_str = re.search(args_pattern, args_str).group(1) - - inputs_list, attrs_list = ParseYamlArgs(args_str) - returns_list = ParseYamlReturns(returns_str) - - return inputs_list, attrs_list, returns_list - - -####################### -### Preprocessing ### -####################### -def ForwardsValidationCheck(forward_inputs_list, forward_attrs_list, - forward_returns_list, orig_forward_inputs_list, - orig_forward_attrs_list, orig_forward_returns_list): - for i in range(len(forward_inputs_list)): - forward_input_name = forward_inputs_list[i][0] - forward_input_type = forward_inputs_list[i][1] - forward_input_pos = forward_inputs_list[i][2] - orig_input_name = orig_forward_inputs_list[i][0] - orig_input_type = orig_forward_inputs_list[i][1] - orig_input_pos = orig_forward_inputs_list[i][2] - - assert forward_input_type == orig_input_type - assert forward_input_pos == orig_input_pos - - for i in range(len(forward_attrs_list)): - orig_attr_name = 
orig_forward_attrs_list[i][0] - orig_attr_type = orig_forward_attrs_list[i][1] - orig_attr_default = orig_forward_attrs_list[i][2] - orig_attr_pos = orig_forward_attrs_list[i][3] - forward_attr_name = forward_attrs_list[i][0] - forward_attr_type = forward_attrs_list[i][1] - forward_attr_default = forward_attrs_list[i][2] - forward_attr_pos = forward_attrs_list[i][3] - assert orig_attr_type == forward_attr_type - assert orig_attr_default == forward_attr_default - assert orig_attr_pos == forward_attr_pos - - for i in range(len(forward_returns_list)): - orig_return_type = orig_forward_returns_list[i][1] - orig_return_pos = orig_forward_returns_list[i][2] - forward_return_type = forward_returns_list[i][1] - forward_return_pos = forward_returns_list[i][2] - - assert orig_return_type == forward_return_type - assert orig_return_pos == forward_return_pos - - # Check Order: Inputs, Attributes - max_input_position = -1 - for _, _, pos in forward_inputs_list: - max_input_position = max(max_input_position, pos) - - max_attr_position = -1 - for _, _, _, pos in forward_attrs_list: - assert pos > max_input_position - max_attr_position = max(max_attr_position, pos) - - -def BackwardValidationCheck(backward_fwd_input_map, backward_grad_input_map, - backward_attrs_list): - - # Check Order: TensorWrappers, GradTensors, Attributes - max_fwd_input_position = -1 - for _, (_, _, pos) in backward_fwd_input_map.items(): - max_fwd_input_position = max(max_fwd_input_position, pos) - - max_grad_tensor_position = -1 - for _, (_, _, pos) in backward_grad_input_map.items(): - assert pos > max_fwd_input_position - max_grad_tensor_position = max(max_grad_tensor_position, pos) - - max_attr_position = -1 - for _, _, _, pos in backward_attrs_list: - assert pos > max_grad_tensor_position - max_attr_position = max(max_attr_position, pos) - - -def DetermineForwardPositionMap(forward_inputs_list, forward_returns_list): - forward_inputs_position_map = {} - forward_outputs_position_map = {} - for i in range(len(forward_inputs_list)): - forward_input = forward_inputs_list[i] - input_name = forward_input[0] - input_type = forward_input[1] - input_pos = forward_input[2] - - forward_inputs_position_map[input_name] = [input_type, input_pos] - - for i in range(len(forward_returns_list)): - forward_return = forward_returns_list[i] - return_name = forward_return[0] - return_type = forward_return[1] - return_pos = forward_return[2] - - forward_outputs_position_map[return_name] = [return_type, return_pos] - - return forward_inputs_position_map, forward_outputs_position_map - - -def SlotNameMatching(backward_inputs_list, backward_returns_list, - forward_inputs_position_map, forward_outputs_position_map): - - backward_fwd_input_map = {} - backward_grad_input_map = {} - backward_grad_output_map = {} - - for backward_input in backward_inputs_list: - backward_input_name = backward_input[0] - backward_input_type = backward_input[1] - backward_input_pos = backward_input[2] - - backward_fwd_name = FindForwardName(backward_input_name) - if backward_fwd_name: - # Grad Input - assert backward_fwd_name in forward_outputs_position_map.keys() - matched_forward_output_type = forward_outputs_position_map[ - backward_fwd_name][0] - matched_forward_output_pos = forward_outputs_position_map[ - backward_fwd_name][1] - - backward_grad_input_map[backward_input_name] = [ - backward_input_type, matched_forward_output_pos, - backward_input_pos - ] - else: - # TensorWrapper Input - if backward_input_name in forward_inputs_position_map.keys(): - tensor_wrapper_type = 
forward_inputs_position_map[ - backward_input_name][0] - backward_fwd_input_map[backward_input_name] = [ - backward_input_type, True, backward_input_pos - ] - - elif backward_input_name in forward_outputs_position_map.keys(): - tensor_wrapper_type = forward_outputs_position_map[ - backward_input_name][0] - backward_fwd_input_map[backward_input_name] = [ - backward_input_type, False, backward_input_pos - ] - else: - assert False, backward_input_name - - for backward_output in backward_returns_list: - backward_output_name = backward_output[0] - backward_output_type = backward_output[1] - backward_output_pos = backward_output[2] - - backward_fwd_name = FindForwardName(backward_output_name) - assert backward_fwd_name is not None - assert backward_fwd_name in forward_inputs_position_map.keys( - ), backward_fwd_name - - matched_forward_input_type = forward_inputs_position_map[ - backward_fwd_name][0] - matched_forward_input_pos = forward_inputs_position_map[ - backward_fwd_name][1] - - backward_grad_output_map[backward_output_name] = [ - backward_output_type, matched_forward_input_pos, backward_output_pos - ] - - return backward_fwd_input_map, backward_grad_input_map, backward_grad_output_map - - -def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map, - backward_attrs_list, no_need_buffer_set): - # Inputs: - # fwd_api_name = "" - # backward_fwd_input_map = { "name" : [type, is_fwd_input, orig_position] ...} - # backward_attrs_list = [ [attr_name, attr_type, default_value, orig_position], ...] - - # Determine Node Name - forward_op_name = fwd_api_name - - # SetTensorWrapper Methods & TensorWrapper Members - set_tensor_wrapper_methods_str = "" - tensor_wrapper_members_str = "" - clear_tensor_wrapper_str = "" - for tname, (ttype, is_fwd_input, _) in backward_fwd_input_map.items(): - if tname in no_need_buffer_set: - no_need_buffer = "true" - else: - no_need_buffer = "false" - - tensor_wrapper_name = GetSavedName(tname) - if IsPlainTensorType(ttype): - SET_PLAIN_TENSOR_WRAPPER_TEMPLATE = """ +######################## +## Code Gen Templates ## +######################## +SET_PLAIN_TENSOR_WRAPPER_TEMPLATE = \ +""" void SetTensorWrapper{}(const paddle::experimental::Tensor& {}, bool full_reserved) {{ {} = egr::TensorWrapper({}, full_reserved, {}); }} """ - set_tensor_wrapper_methods_str += SET_PLAIN_TENSOR_WRAPPER_TEMPLATE.format( - tname, tname, tensor_wrapper_name, tname, no_need_buffer) - PLAIN_TENSOR_MEMBER_TEMPLATE = """ - egr::TensorWrapper {}; +PLAIN_TENSOR_MEMBER_TEMPLATE = \ +""" + egr::TensorWrapper {}; """ - tensor_wrapper_members_str += PLAIN_TENSOR_MEMBER_TEMPLATE.format( - tensor_wrapper_name) - CLEAR_TENSOR_WRAPPERS_TEMPLATE = """ - {}.clear(); +CLEAR_TENSOR_WRAPPER_TEMPLATE = \ +""" + {}.clear(); """ - clear_tensor_wrapper_str += CLEAR_TENSOR_WRAPPERS_TEMPLATE.format( - tensor_wrapper_name) - else: - assert IsVectorTensorType(ttype) - SET_VECTOR_TENSOR_WRAPPER_TEMPLATE = """ - void SetTensorWrapper{}(const std::vector& {}, bool full_reserved) {{ - for(const auto& eager_tensor : {}) {{ - {}.emplace_back( egr::TensorWrapper(eager_tensor, full_reserved, {}) ); - }}; - }} +SET_VECTOR_TENSOR_WRAPPER_TEMPLATE = \ +""" + void SetTensorWrapper{}(const std::vector& {}, bool full_reserved) {{ + for(const auto& eager_tensor : {}) {{ + {}.emplace_back( egr::TensorWrapper(eager_tensor, full_reserved, {}) ); + }}; + }} """ - set_tensor_wrapper_methods_str += SET_VECTOR_TENSOR_WRAPPER_TEMPLATE.format( - tname, tname, tname, tensor_wrapper_name, no_need_buffer) - VECTOR_TENSOR_MEMBER_TEMPLATE 
= """ - std::vector {}; +VECTOR_TENSOR_MEMBER_TEMPLATE = \ +""" + std::vector {}; """ - tensor_wrapper_members_str += VECTOR_TENSOR_MEMBER_TEMPLATE.format( - tensor_wrapper_name) - CLEAR_TENSOR_WRAPPERS_TEMPLATE = """ - for (auto tw: {}) { - tw.clear(); - }; +CLEAR_VECTOR_TENSOR_WRAPPERS_TEMPLATE = \ """ - clear_tensor_wrapper_str += CLEAR_TENSOR_WRAPPERS_TEMPLATE.format( - tensor_wrapper_name) - - # End: SetTensorWrapper Methods & TensorWrapper Members - - # SetAttributes & Attribute Members - set_attribute_methods_str = "" - attribute_members_str = "" - for aname, atype, default_val, _ in backward_attrs_list: - saved_attr_name = GetSavedName(aname) - SET_ATTR_METHOD_TEMPLATE = """ - void SetAttribute{}({} {}) {{ - {} = {}; - }} + for (auto tw: {}) { + tw.clear(); + }; """ - set_attribute_methods_str += SET_ATTR_METHOD_TEMPLATE.format( - aname, GetConstReference(atype), aname, saved_attr_name, aname) - if default_val: - ATTRIBUTE_MEMBER_TEMPLATE = """ +SET_ATTR_METHOD_TEMPLATE = \ +""" + void SetAttribute{}({} {}) {{ + {} = {}; + }} +""" + +ATTRIBUTE_MEMBER_WITH_DEFAULT_TEMPLATE = \ +""" {} {} = {}; - """ - attribute_members_str += ATTRIBUTE_MEMBER_TEMPLATE.format( - RemoveConstAndReference(atype), saved_attr_name, default_val) - else: - ATTRIBUTE_MEMBER_TEMPLATE = """ +""" + +ATTRIBUTE_MEMBER_TEMPLATE = \ +""" {} {}; - """ - attribute_members_str += ATTRIBUTE_MEMBER_TEMPLATE.format( - RemoveConstAndReference(atype), saved_attr_name) - # End: SetAttributes & Attribute Members - - grad_node_name = GetGradNodeName(fwd_api_name) - NODE_DECLARATION_TEMPLATE = """ -class {} : public egr::GradNodeBase {{ - public: - {}() : egr::GradNodeBase() {{}} - {}(size_t bwd_in_slot_num, size_t bwd_out_slot_num) : - egr::GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) {{}} - ~{}() override = default; - - virtual std::vector> operator()( - std::vector>& grads, bool create_graph = false) override; - - std::string name() override {{ return \" {} \"; }} - - void ClearTensorWrappers() override {{ - {} - is_tensor_wrappers_cleared = true; - }} - - // SetTensorWrapperX, SetTensorWrapperY, ... - {} - // SetAttributes - {} - - bool IsTensorWrappersCleared() override {{ - return is_tensor_wrappers_cleared; - }} - private: - // TensorWrappers - {} - - bool is_tensor_wrappers_cleared = false; - - // Attributes - {} -}}; """ - node_declaration_str = NODE_DECLARATION_TEMPLATE.format( - grad_node_name, grad_node_name, grad_node_name, grad_node_name, - grad_node_name, clear_tensor_wrapper_str, - set_tensor_wrapper_methods_str, set_attribute_methods_str, - tensor_wrapper_members_str, attribute_members_str) - - return node_declaration_str - - -def GenerateNodeDefinition(fwd_api_name, bwd_api_name, backward_fwd_input_map, - backward_grad_input_map, backward_grad_output_map, - backward_attrs_list): - # fwd_api_name = "" - # backward_fwd_input_map = { "name" : [type, is_fwd_input, orig_position] ...} - # backward_grad_input_map = { "name" : [type, fwd_position, orig_position] ...} - # backward_grad_output_map = { "name" : [type, fwd_position, orig_position] ...} - # backward_attrs_list = [ [attr_name, attr_type, default_value, orig_position], ...] 
- - # Construct grad_api function args - # Order: TensorWrappers, GradTensors, Attributes - grad_api_args_len = len(backward_fwd_input_map.keys()) + len( - backward_grad_input_map.keys()) + len(backward_attrs_list) - grad_api_args = ["" for i in range(grad_api_args_len)] - for name, (_, is_fwd_input, - grad_api_position), in backward_fwd_input_map.items(): - tensor_wrapper_name = GetSavedName(name) - grad_api_args[ - grad_api_position] = f"egr::EagerUtils::RecoverTensorWrapper(&this->{tensor_wrapper_name}, nullptr)" - - for _, (ttype, fwd_position, - grad_api_position) in backward_grad_input_map.items(): - if IsPlainTensorType(ttype): - grad_api_args[ - grad_api_position] = f"hooked_grads[{fwd_position}][0]" - else: - assert IsVectorTensorType(ttype) - grad_api_args[grad_api_position] = f"hooked_grads[{fwd_position}]" - - for name, _, _, grad_api_position in backward_attrs_list: - saved_attribute_name = GetSavedName(name) - grad_api_args[grad_api_position] = f"this->{saved_attribute_name}" - grad_api_args_str = ", ".join(grad_api_args) - - # Construct grad_api returns - num_bwd_outputs = len(backward_grad_output_map.keys()) - returns_str = f"std::vector> returns({num_bwd_outputs});\n" - for _, (ttype, fwd_position, - grad_api_position) in backward_grad_output_map.items(): - # Infer Grad API Return Type - if num_bwd_outputs == 1: - # Single tensor output, return as is - if IsPlainTensorType(ttype): - returns_str += "returns[0] = { grad_api_returns };\n" - else: - assert IsVectorTensorType(ttype) - returns_str += "returns[0] = grad_api_returns;\n" - else: - # Rearrange output order accordingly - returns_str += f"returns[{fwd_position}] = grad_api_returns[{grad_api_position}];\n" - returns_str += f"if(NeedComplexToRealConversion()) HandleComplexGradToRealGrad(&returns);\n" - returns_str += f"return returns;\n" - grad_node_name = GetGradNodeName(fwd_api_name) +NODE_DECLARATION_TEMPLATE = \ +""" + class {} : public egr::GradNodeBase {{ + public: + {}() : egr::GradNodeBase() {{}} + {}(size_t bwd_in_slot_num, size_t bwd_out_slot_num) : + egr::GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) {{}} + ~{}() override = default; + + virtual std::vector> operator()( + std::vector>& grads, bool create_graph = false) override; + std::string name() override {{ return \" {} \"; }} + + void ClearTensorWrappers() override {{ + {} + is_tensor_wrappers_cleared = true; + }} + + // SetTensorWrapperX, SetTensorWrapperY, ... 
+ {} + // SetAttributes + {} - fill_zero_str = "" - if fwd_api_name in ops_to_fill_zero_for_empty_grads: - fill_zero_str = "egr::EagerUtils::FillZeroForEmptyGradInputs(&grads, this->InputMeta());\n" + bool IsTensorWrappersCleared() override {{ + return is_tensor_wrappers_cleared; + }} + private: + // TensorWrappers + {} - if len(namespace) > 0: - grad_api_namespace = f"paddle::experimental::{namespace}" - else: - grad_api_namespace = f"paddle::experimental" + bool is_tensor_wrappers_cleared = false; - FUNCTION_TEMPLATE = """ -std::vector> {}::operator()(std::vector>& grads, bool create_graph) {{ - {} - auto hooked_grads = ApplyGradientHooks(grads); - - // Call grad_api function - VLOG(3) << \"Final State Running: \" << \"{}\"; - auto grad_api_returns = {}::{}({}); - {} -}} - """ - - node_definition_str = FUNCTION_TEMPLATE.format( - grad_node_name, fill_zero_str, grad_node_name, grad_api_namespace, - bwd_api_name, grad_api_args_str, returns_str) - - return node_definition_str - - -def GenerateNodeCreationCodes( - fwd_api_name, bwd_api_name, forward_inputs_position_map, - forward_outputs_position_map, forward_attrs_list, forward_call_str, - backward_fwd_input_map, backward_grad_input_map, - backward_grad_output_map, backward_attrs_list, optional_inputs, - inplace_map): - # fwd_api_name = "" - # forward_inputs_position_map = { "name" : [type, fwd_position] } - # forward_outputs_position_map = { "name" : [type, fwd_position] } - # forward_attrs_list = [ [attr_name, attr_type, default_value, orig_position], ...] - # backward_fwd_input_map = { "name" : [type, is_fwd_input, orig_position] ...} - # backward_grad_input_map = { "name" : [type, fwd_position, orig_position] ...} - # backward_grad_output_map = { "name" : [type, fwd_position, orig_position] ...} - # backward_attrs_list = [ [attr_name, attr_type, default_value, orig_position], ...] 
- - # Get Input AutoGradMeta - inputs_autograd_meta_list = [] - compute_require_grad_args_list = ["trace_backward"] - for name, (ttype, pos) in forward_inputs_position_map.items(): - input_autograd_meta_name = GetAutoGradMetaName(name) - if IsPlainTensorType(ttype): - input_autograd_meta = f" egr::AutogradMeta* {input_autograd_meta_name} = egr::EagerUtils::nullable_autograd_meta({name});" - else: - assert IsVectorTensorType(ttype) - input_autograd_meta_vec_name = GetAutoGradMetaVectorName(name) - input_autograd_meta = f" std::vector {input_autograd_meta_vec_name} = egr::EagerUtils::nullable_autograd_meta({name});\n" - input_autograd_meta += f" std::vector* {input_autograd_meta_name} = &{input_autograd_meta_vec_name};" - - inputs_autograd_meta_list.append(input_autograd_meta) - compute_require_grad_args_list.append(input_autograd_meta_name) - inputs_autograd_meta_str = "\n".join(inputs_autograd_meta_list) - compute_require_grad_args_str = ",".join(compute_require_grad_args_list) - - # Get Output AutoGradMeta - outputs_autograd_meta_list = [] - pass_stop_gradient_args_list = ["false"] - num_fwd_outputs = len(forward_outputs_position_map.keys()) - for name, (rtype, pos) in forward_outputs_position_map.items(): - output_autograd_meta_name = GetAutoGradMetaName(name) - output_autograd_meta_vec_name = GetAutoGradMetaVectorName(name) - if num_fwd_outputs == 1: - if IsPlainTensorType(rtype): - output_autograd_meta = f" egr::AutogradMeta* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(&api_result);" - else: - assert IsVectorTensorType(rtype) - output_autograd_meta = f" std::vector {output_autograd_meta_vec_name} = egr::EagerUtils::autograd_meta(&api_result);\n" - output_autograd_meta += f" std::vector* {output_autograd_meta_name} = &{output_autograd_meta_vec_name};" - else: - # Tuple api_result - if IsPlainTensorType(rtype): - output_autograd_meta = f" egr::AutogradMeta* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(&std::get<{pos}>(api_result));" - else: - assert IsVectorTensorType(rtype) - output_autograd_meta = f" std::vector {output_autograd_meta_vec_name} = egr::EagerUtils::autograd_meta(&std::get<{pos}>(api_result));\n" - output_autograd_meta += f" std::vector* {output_autograd_meta_name} = &{output_autograd_meta_vec_name};" - - outputs_autograd_meta_list.append(output_autograd_meta) - pass_stop_gradient_args_list.append(output_autograd_meta_name) - - # ComputeRequireGrad & PassStopGradient - outputs_autograd_meta_str = "\n".join(outputs_autograd_meta_list) - pass_stop_gradient_args_str = ",".join(pass_stop_gradient_args_list) - - # Check Inplace - check_inplace_str = "" - bump_inplace_version_str = "" - for inplace_name in inplace_map.keys(): - inplace_autograd_meta_name = GetAutoGradMetaName(inplace_name) - check_inplace_str += f""" - // Check Inplace - egr::EagerUtils::CheckInplace({inplace_name}, {inplace_autograd_meta_name}, require_any_grad);\n + // Attributes + {} + }}; """ - bump_inplace_version_str += f""" - // Bump Inplace Version - {inplace_name}.bump_inplace_version(); - VLOG(3) << \"Tensor(\" << {inplace_name}.name() << \") uses Inplace Strategy.\";\n +FUNCTION_TEMPLATE = \ +""" + std::vector> {}::operator()(std::vector>& grads, bool create_graph) {{ + {} + auto hooked_grads = ApplyGradientHooks(grads); + + // Call grad_api function + VLOG(3) << \"Final State Running: \" << \"{}\"; + auto grad_api_returns = {}{}({}); + {} + }} """ - # Node Construction - num_bwd_inputs = len(backward_grad_input_map.keys()) - num_bwd_outputs = 
len(backward_grad_output_map.keys()) - grad_node_name = GetGradNodeName( - RecoverBaseNameOfInplaceFunction( - fwd_api_name)) if inplace_map else GetGradNodeName(fwd_api_name) - node_construction_str = f" auto grad_node = std::make_shared<{grad_node_name}>({num_bwd_inputs}, {num_bwd_outputs});" - - # SetAttributes - set_attributes_list = [] - forward_attrs_name_set = set() - for name, _, _, _ in forward_attrs_list: - forward_attrs_name_set.add(name) - - for name, _, default_val_attr, _ in backward_attrs_list: - if name in forward_attrs_name_set: - set_attributes = f" grad_node->SetAttribute{name}({name});" - else: - set_attributes = f" grad_node->SetAttribute{name}({default_val_attr});" - set_attributes_list.append(set_attributes) - set_attributes_str = "\n".join(set_attributes_list) - - # SetTensorWrappers - set_tensor_wrappers_list = [] - for name, (atype, is_fwd_input, pos) in backward_fwd_input_map.items(): - is_optional = (name in optional_inputs) - - if is_fwd_input: - if is_optional: - set_tensor_wrappers = f" if({name}.is_initialized()) grad_node->SetTensorWrapper{name}({name}, true);" - else: - set_tensor_wrappers = f" grad_node->SetTensorWrapper{name}({name}, true);" - else: - if num_fwd_outputs > 1: - # Aligned with forward output position - assert name in forward_outputs_position_map.keys() - fwd_output_pos = forward_outputs_position_map[name][1] - tw_name = f"std::get<{fwd_output_pos}>(api_result)" - else: - tw_name = f"api_result" +FORWARD_FUNCTION_TEMPLATE = \ +""" + {} {}({}) {{ + {} + + {} - if is_optional: - set_tensor_wrappers = f" if({tw_name}.is_initialized()) grad_node->SetTensorWrapper{name}({tw_name}, false);" - else: - set_tensor_wrappers = f" grad_node->SetTensorWrapper{name}({tw_name}, false);" - set_tensor_wrappers_list.append(set_tensor_wrappers) - set_tensor_wrappers_str = "\n".join(set_tensor_wrappers_list) - - # SetGradOutMeta & SetEdges - set_grad_out_meta_list = [] - set_edges_list = [] - for name, (_, pos) in forward_inputs_position_map.items(): - input_autograd_meta_name = GetAutoGradMetaName(name) - set_grad_out_meta = f" grad_node->SetGradOutMeta({name}, {pos});" - set_edges = f" grad_node->AddEdges({input_autograd_meta_name}, {pos});" - set_grad_out_meta_list.append(set_grad_out_meta) - set_edges_list.append(set_edges) - set_grad_out_meta_str = "\n".join(set_grad_out_meta_list) - set_edges_str = "\n".join(set_edges_list) - - # SetOutRank & SetHistory & SetGradInMeta - set_out_rank_list = [] - set_history_list = [] - set_grad_in_meta_list = [] - set_retain_grad_list = [] - num_outputs = len(forward_outputs_position_map.keys()) - for name, (_, pos) in forward_outputs_position_map.items(): - output_autograd_meta_name = GetAutoGradMetaName(name) - set_out_rank = f" egr::EagerUtils::SetOutRankWithSlot({output_autograd_meta_name}, {pos});" - set_history = f" egr::EagerUtils::SetHistory({output_autograd_meta_name}, grad_node);" - if num_outputs == 1: - set_retain_grad = f" egr::EagerUtils::CheckAndRetainGrad(api_result);" - set_grad_in_meta = f" grad_node->SetGradInMeta(api_result, {pos});" - else: - set_retain_grad = f" egr::EagerUtils::CheckAndRetainGrad(std::get<{pos}>(api_result));" - set_grad_in_meta = f" grad_node->SetGradInMeta(std::get<{pos}>(api_result), {pos});" - - set_out_rank_list.append(set_out_rank) - set_history_list.append(set_history) - set_grad_in_meta_list.append(set_grad_in_meta) - set_retain_grad_list.append(set_retain_grad) - - set_out_rank_str = "\n".join(set_out_rank_list) - set_history_str = "\n".join(set_history_list) - 
set_grad_in_meta_str = "\n".join(set_grad_in_meta_list) - set_retain_grad_str = "\n".join(set_retain_grad_list) - - node_event_name = fwd_api_name + " node_creation" - NODE_CREATION_TEMPLATE = """ - paddle::platform::RecordEvent node_creation_record_event(\"{}\", paddle::platform::TracerEventType::Operator, 1);\n - """ - node_creation_event_str = NODE_CREATION_TEMPLATE.format(node_event_name) + // Returns + return {}; + }} - NODE_CREATION_TEMPLATE = """ +""" +NODE_CREATION_TEMPLATE = \ +""" // Get AutoGradMeta {} bool trace_backward = egr::Controller::Instance().HasGrad(); @@ -924,185 +202,72 @@ def GenerateNodeCreationCodes( {} }} }} +""" +NAMESPACE_WRAPPER_TEMPLATE = \ +""" +namespace {} {{ + {} +}} """ - node_creation_str = NODE_CREATION_TEMPLATE.format( - inputs_autograd_meta_str, compute_require_grad_args_str, - check_inplace_str, forward_call_str, bump_inplace_version_str, - node_creation_event_str, outputs_autograd_meta_str, - pass_stop_gradient_args_str, node_construction_str, set_attributes_str, - set_tensor_wrappers_str, set_grad_out_meta_str, set_edges_str, - set_out_rank_str, set_history_str, set_grad_in_meta_str, - set_retain_grad_str) - - return node_creation_str - - -def GenerateForwardDefinition( - fwd_api_name, bwd_api_name, forward_inputs_position_map, - forward_outputs_position_map, forward_attrs_list, - backward_fwd_input_map, backward_grad_input_map, - backward_grad_output_map, backward_attrs_list, optional_inputs, - intermediate_outputs, inplace_map): - # fwd_api_name = "" - # forward_inputs_position_map = { "name" : [type, fwd_position] } - # forward_outputs_position_map = { "name" : [type, fwd_position] } - # forward_attrs_list = [ [attr_name, attr_type, default_value, orig_position], ...] - # backward_fwd_input_map = { "name" : [type, is_fwd_input, orig_position] ...} - # backward_grad_input_map = { "name" : [type, fwd_position, orig_position] ...} - # backward_grad_output_map = { "name" : [type, fwd_position, orig_position] ...} - # backward_attrs_list = [ [attr_name, attr_type, default_value, orig_position], ...] - # optional_inputs = ["name0", ...] 
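The templates hoisted to module level here are ordinary positional `str.format` targets; `NAMESPACE_WRAPPER_TEMPLATE` is applied last when a namespace such as `sparse` was inferred from the yaml path. A minimal sketch with invented argument values (seven slots for `FUNCTION_TEMPLATE`, one per `{}` placeholder):

    # Illustrative expansion only; the argument values below are made up.
    node_definition = FUNCTION_TEMPLATE.format(
        "ScaleGradNode",                      # class whose operator() is being defined
        "",                                   # optional FillZeroForEmptyGradInputs line
        "ScaleGradNode",                      # name echoed in the VLOG message
        "paddle::experimental::",             # grad api namespace prefix
        "scale_grad",                         # backward api name
        "hooked_grads[0][0], this->scale_",   # grad api call arguments
        "return returns;")                    # returns block

    wrapped = NAMESPACE_WRAPPER_TEMPLATE.format("sparse", node_definition)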
- - # Get Function Args - num_inputs = len(forward_attrs_list) + len(forward_inputs_position_map.keys( - )) - inputs_args_definition_list = ["" for i in range(num_inputs)] - inputs_args_declaration_list = ["" for i in range(num_inputs)] - inputs_call_list = ["" for i in range(num_inputs)] - for name, (ttype, pos) in forward_inputs_position_map.items(): - inputs_call_list[pos] = f"{name}" - is_optional = (name in optional_inputs) - if IsPlainTensorType(ttype): - if is_optional: - arg_str = f"const paddle::optional& {name}" - else: - if inplace_map and name in inplace_map.keys(): - arg_str = f"paddle::experimental::Tensor& {name}" - else: - arg_str = f"const paddle::experimental::Tensor& {name}" - else: - assert IsVectorTensorType(ttype) - arg_str = f"const std::vector& {name}" - inputs_args_definition_list[pos] = arg_str - inputs_args_declaration_list[pos] = arg_str +NODE_CC_FILE_TEMPLATE = \ +""" +#include "glog/logging.h" +#include "paddle/phi/api/all.h" +#include "paddle/phi/api/backward/backward_api.h" +#include "paddle/phi/api/backward/sparse_bw_api.h" +#include "paddle/fluid/imperative/tracer.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/eager/utils.h" +#include "paddle/fluid/eager/api/utils/global_utils.h" +#include "paddle/fluid/eager/api/generated/eager_generated/backwards/nodes.h" +#include "paddle/fluid/eager/to_static/run_program_op_node.h" - for name, atype, default_val, pos in forward_attrs_list: - inputs_call_list[pos] = name - if default_val is not None: - inputs_args_declaration_list[ - pos] = f"{atype} {name} = {default_val}" - else: - inputs_args_declaration_list[pos] = f"{atype} {name}" - inputs_args_definition_list[pos] = f"{atype} {name}" - - inputs_args_declaration_str = ", ".join(inputs_args_declaration_list) - inputs_args_definition_str = ", ".join(inputs_args_definition_list) - inputs_call_args_str = ", ".join(inputs_call_list) - - # Forward Full Logic - if len(intermediate_outputs) == 0: - function_name = fwd_api_name - else: - function_name = fwd_api_name + "_intermediate" - - if len(namespace) > 0: - forward_call_str = f"auto api_result = paddle::experimental::{namespace}::{function_name}({inputs_call_args_str});" - else: - forward_call_str = f"auto api_result = paddle::experimental::{function_name}({inputs_call_args_str});" - - # Get return type list & outputs - num_outputs = len(forward_outputs_position_map.keys()) - len( - intermediate_outputs) - returns_type_list = ["" for i in range(num_outputs)] - returns_list = ["" for i in range(num_outputs)] - for name, (rtype, pos) in forward_outputs_position_map.items(): - if name in intermediate_outputs: - continue - if num_outputs == 1: - returns_list[0] = f"api_result" - else: - # Tuple api_result - returns_list[pos] = f"std::get<{pos}>(api_result)" +#include "paddle/phi/api/include/sparse_api.h" - if IsPlainTensorType(rtype): - returns_type_list[pos] = "paddle::experimental::Tensor" - else: - assert IsVectorTensorType(rtype) - returns_type_list[pos] = "std::vector" - - if num_outputs == 1: - returns_str = returns_list[0] - returns_type_str = returns_type_list[0] - else: - returns_type_str = ", ".join(returns_type_list) - returns_type_str = f"std::tuple<{returns_type_str}>" - returns_str = ", ".join(returns_list) - returns_str = f"std::make_tuple({returns_str})" - - node_creation_str = GenerateNodeCreationCodes( - fwd_api_name, bwd_api_name, forward_inputs_position_map, - forward_outputs_position_map, forward_attrs_list, forward_call_str, - backward_fwd_input_map, 
backward_grad_input_map, - backward_grad_output_map, backward_attrs_list, optional_inputs, - inplace_map) - - dygraph_event_str = f"paddle::platform::RecordEvent dygraph_entrance_record_event(\"{fwd_api_name} dygraph\", paddle::platform::TracerEventType::Operator, 1);" - - FORWARD_FUNCTION_TEMPLATE = """ -{} {}({}) {{ - {} - {} - - // Returns - return {}; -}} """ - forward_function_name = GetForwardFunctionName(fwd_api_name) - forward_function_str = FORWARD_FUNCTION_TEMPLATE.format( - returns_type_str, forward_function_name, inputs_args_definition_str, - dygraph_event_str, node_creation_str, returns_str) - forward_function_declaration_str = f"{returns_type_str} {forward_function_name}({inputs_args_declaration_str});" - - return forward_function_str, forward_function_declaration_str - - -def CollectCoreOpsInformation(fwd_api_name, forward_inputs_position_map, - forward_outputs_position_map, forward_attrs_list): - # fwd_api_name : "" - # forward_inputs_position_map = { "name" : [type, fwd_position] } - # forward_outputs_position_map = { "name" : [type, fwd_position] } - # forward_attrs_list = [ [attr_name, attr_type, default_value, orig_position], ...] - num_args = len(forward_inputs_position_map.keys()) + len(forward_attrs_list) - num_returns = len(forward_outputs_position_map.keys()) - - final_state_fwd_api_name = "final_state_" + fwd_api_name - core_ops_returns_info[ - final_state_fwd_api_name] = ["" for i in range(num_returns)] - core_ops_args_info[final_state_fwd_api_name] = ["" for i in range(num_args)] - core_ops_args_type_info[ - final_state_fwd_api_name] = ["" for i in range(num_args)] - for name, (ttype, pos) in forward_inputs_position_map.items(): - core_ops_args_info[final_state_fwd_api_name][pos] = name - if IsPlainTensorType(ttype): - core_ops_args_type_info[final_state_fwd_api_name][pos] = "tensor" - else: - assert IsVectorTensorType(ttype) - core_ops_args_type_info[final_state_fwd_api_name][pos] = "list" - - for name, _, _, pos in forward_attrs_list: - core_ops_args_info[final_state_fwd_api_name][pos] = name +NODE_H_FILE_TEMPLATE = \ +""" +#pragma once +#include "paddle/fluid/eager/tensor_wrapper.h" +#include "paddle/fluid/eager/grad_node_info.h" - for name, (ttype, pos) in forward_outputs_position_map.items(): - core_ops_returns_info[final_state_fwd_api_name][pos] = name +{} +""" +FORWARD_CC_FILE_TEMPLATE = \ +""" +#include "paddle/phi/api/lib/dygraph_api.h" +#include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" +#include "paddle/fluid/eager/api/generated/eager_generated/backwards/nodes.h" -def GenerateCoreOpInfoDeclaration(): - core_ops_declaration_str = """ - extern std::unordered_map> core_ops_final_state_args_info; - extern std::unordered_map> core_ops_final_state_args_type_info; - extern std::unordered_map> core_ops_final_state_returns_info; +#include "paddle/phi/api/include/sparse_api.h" +#include "paddle/fluid/eager/api/utils/global_utils.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" +{} +{} """ - return core_ops_declaration_str +FORWARD_H_FILE_TEMPLATE = \ +""" +#pragma once +#include "glog/logging.h" +#include "paddle/fluid/eager/autograd_meta.h" +#include "paddle/phi/api/all.h" +#include "paddle/fluid/eager/utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/eager/to_static/run_program_op_func.h" -def GenerateCoreOpInfoDefinition(): +{} +{} +""" - CORE_OPS_INFO_TEMPLATE = """ +CORE_OPS_INFO_TEMPLATE = \ +""" std::unordered_map> core_ops_final_state_args_info = {{ {} }}; @@ -1114,6 
+279,38 @@ std::unordered_map> core_ops_final_state_r }}; """ + +CORE_OPS_DECLARATION_TEMPLATE = \ +""" + extern std::unordered_map> core_ops_final_state_args_info; + extern std::unordered_map> core_ops_final_state_args_type_info; + extern std::unordered_map> core_ops_final_state_returns_info; + +""" + +CHECK_INPLACE_TEMPLATE = \ +""" + // Check Inplace + egr::EagerUtils::CheckInplace({}, {}, require_any_grad);\n +""" + +BUMP_INPLACE_VERSION_TEMPLATE = \ +""" + // Bump Inplace Version + {}.bump_inplace_version(); + VLOG(3) << \"Tensor(\" << {}.name() << \") uses Inplace Strategy.\";\n +""" + + +####################### +## Generator Helpers ## +####################### +def GenerateCoreOpInfoDeclaration(): + return CORE_OPS_DECLARATION_TEMPLATE + + +def GenerateCoreOpInfoDefinition(): + op_args_info_list = [] for op_name, arg_list in core_ops_args_info.items(): arg_str = ",".join(["\"" + v + "\"" for v in arg_list]) @@ -1142,68 +339,864 @@ std::unordered_map> core_ops_final_state_r return core_ops_info_definition_str +##################### +## Generator Class ## +##################### +class DygraphSingleFunctionGenerator(FunctionGeneratorBase): + def __init__(self, forward_api_contents, grad_api_contents, namespace): + self.forward_api_contents = forward_api_contents + # Members from Parent: + #self.namespace + #self.forward_api_contents + #self.forward_api_name + #self.orig_forward_inputs_list + #self.orig_forward_attrs_list + #self.orig_forward_returns_list + #self.forward_inputs_position_map + #self.forward_outputs_position_map + #self.optional_inputs + #self.no_need_buffers + #self.intermediate_outputs + #self.inplace_map + FunctionGeneratorBase.__init__(self, forward_api_contents, namespace) + + self.grad_api_contents = grad_api_contents + + # Raw Contents + self.backward_forward_str = "" + self.backward_api_name = "" + + self.forward_attrs_list = [ + ] #[ [attr_name, attr_type, default_value, orig_position], ...] + self.forward_inputs_list = [ + ] #[ [arg_name, arg_type, orig_position], ...] + self.forward_returns_list = [ + ] #[ [ret_name, ret_type, orig_position], ...] + + self.backward_inputs_list = [ + ] #[ [attr_name, attr_type, default_value, orig_position], ...] + self.backward_attrs_list = [ + ] #[ [arg_name, arg_type, orig_position], ...] + self.backward_returns_list = [ + ] #[ [ret_name, ret_type, orig_position], ...] 
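`GenerateCoreOpInfoDefinition()` above serializes three module-level dicts into the C++ initializer lists used for python-level API dispatch; each entry is keyed by the `final_state_` api name and stores per-position argument names, type tags, or return names. A hypothetical entry for the same invented `scale` op:

    # Hypothetical contents; inputs are tagged "tensor" or "list", attrs keep an empty tag.
    core_ops_args_info["final_state_scale"] = ["x", "scale"]
    core_ops_args_type_info["final_state_scale"] = ["tensor", ""]
    core_ops_returns_info["final_state_scale"] = ["out"]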
+ + # SlotNameMatched Backward Data + self.backward_forward_inputs_map = { + } #{ "name" : [type, is_fwd_input, orig_position] ...} + self.backward_grad_inputs_map = { + } #{ "name" : [type, fwd_position, orig_position] ...} + self.backward_grad_outputs_map = { + } #{ "name" : [type, fwd_position, orig_position] ...} + + # Generated Results + self.forward_definition_str = "" + self.forward_declaration_str = "" + self.node_declaration_str = "" + self.node_definition_str = "" + + def DygraphYamlValidationCheck(self): + forward_api_contents = self.forward_api_contents + grad_api_contents = self.grad_api_contents + + assert 'api' in forward_api_contents.keys() + assert 'args' in forward_api_contents.keys() + assert 'output' in forward_api_contents.keys() + assert 'backward' in forward_api_contents.keys() + + assert 'args' in grad_api_contents.keys() + assert 'output' in grad_api_contents.keys() + assert 'forward' in grad_api_contents.keys() + + def ForwardsValidationCheck(self): + forward_inputs_list = self.forward_inputs_list + forward_attrs_list = self.forward_attrs_list + forward_returns_list = self.forward_returns_list + + orig_forward_inputs_list = self.orig_forward_inputs_list + orig_forward_attrs_list = self.orig_forward_attrs_list + orig_forward_returns_list = self.orig_forward_returns_list + + for i in range(len(forward_inputs_list)): + forward_input_name = forward_inputs_list[i][0] + forward_input_type = forward_inputs_list[i][1] + forward_input_pos = forward_inputs_list[i][2] + orig_input_name = orig_forward_inputs_list[i][0] + orig_input_type = orig_forward_inputs_list[i][1] + orig_input_pos = orig_forward_inputs_list[i][2] + + assert forward_input_type == orig_input_type + assert forward_input_pos == orig_input_pos + + for i in range(len(forward_attrs_list)): + orig_attr_name = orig_forward_attrs_list[i][0] + orig_attr_type = orig_forward_attrs_list[i][1] + orig_attr_default = orig_forward_attrs_list[i][2] + orig_attr_pos = orig_forward_attrs_list[i][3] + forward_attr_name = forward_attrs_list[i][0] + forward_attr_type = forward_attrs_list[i][1] + forward_attr_default = forward_attrs_list[i][2] + forward_attr_pos = forward_attrs_list[i][3] + assert orig_attr_type == forward_attr_type + assert orig_attr_default == forward_attr_default + assert orig_attr_pos == forward_attr_pos + + for i in range(len(forward_returns_list)): + orig_return_type = orig_forward_returns_list[i][1] + orig_return_pos = orig_forward_returns_list[i][2] + forward_return_type = forward_returns_list[i][1] + forward_return_pos = forward_returns_list[i][2] + + assert orig_return_type == forward_return_type + assert orig_return_pos == forward_return_pos + + # Check Order: Inputs, Attributes + max_input_position = -1 + for _, _, pos in forward_inputs_list: + max_input_position = max(max_input_position, pos) + + max_attr_position = -1 + for _, _, _, pos in forward_attrs_list: + assert pos > max_input_position + max_attr_position = max(max_attr_position, pos) + + def BackwardValidationCheck(self): + backward_forward_inputs_map = self.backward_forward_inputs_map + backward_grad_inputs_map = self.backward_grad_inputs_map + backward_attrs_list = self.backward_attrs_list + + # Check Order: TensorWrappers, GradTensors, Attributes + max_fwd_input_position = -1 + for _, (_, _, pos) in backward_forward_inputs_map.items(): + max_fwd_input_position = max(max_fwd_input_position, pos) + + max_grad_tensor_position = -1 + for _, (_, _, pos) in backward_grad_inputs_map.items(): + assert pos > max_fwd_input_position + 
max_grad_tensor_position = max(max_grad_tensor_position, pos) + + max_attr_position = -1 + for _, _, _, pos in backward_attrs_list: + assert pos > max_grad_tensor_position + max_attr_position = max(max_attr_position, pos) + + def IntermediateValidationCheck(self): + intermediate_outputs = self.intermediate_outputs + forward_returns_list = self.forward_returns_list + """ + Check whether intermediate_outputs are positioned + at the very end of forward_returns_list + """ + intermediate_positions = range( + len(forward_returns_list) - len(intermediate_outputs), + len(forward_returns_list)) + for ret_name, _, pos in forward_returns_list: + if ret_name in intermediate_outputs: + assert pos in intermediate_positions + + def CollectBackwardInfo(self): + forward_api_contents = self.forward_api_contents + grad_api_contents = self.grad_api_contents + + self.backward_api_name = forward_api_contents['backward'] + self.backward_forward_str = grad_api_contents['forward'] + + backward_args_str = grad_api_contents['args'] + backward_returns_str = grad_api_contents['output'] + + self.backward_inputs_list, self.backward_attrs_list, self.backward_returns_list = ParseYamlBackward( + backward_args_str, backward_returns_str) + print("Parsed Backward Inputs List: ", self.backward_inputs_list) + print("Prased Backward Attrs List: ", self.backward_attrs_list) + print("Parsed Backward Returns List: ", self.backward_returns_list) + + def CollectForwardInfoFromBackwardContents(self): + + backward_forward_str = self.backward_forward_str + + self.forward_inputs_list, self.forward_attrs_list, self.forward_returns_list = ParseYamlForwardFromBackward( + backward_forward_str) + + def SlotNameMatching(self): + backward_inputs_list = self.backward_inputs_list + backward_returns_list = self.backward_returns_list + forward_inputs_position_map = self.forward_inputs_position_map + forward_outputs_position_map = self.forward_outputs_position_map + + for backward_input in backward_inputs_list: + backward_input_name = backward_input[0] + backward_input_type = backward_input[1] + backward_input_pos = backward_input[2] + + backward_fwd_name = FindForwardName(backward_input_name) + if backward_fwd_name: + # Grad Input + assert backward_fwd_name in forward_outputs_position_map.keys() + matched_forward_output_type = forward_outputs_position_map[ + backward_fwd_name][0] + matched_forward_output_pos = forward_outputs_position_map[ + backward_fwd_name][1] + + self.backward_grad_inputs_map[backward_input_name] = [ + backward_input_type, matched_forward_output_pos, + backward_input_pos + ] + else: + # TensorWrapper Input + if backward_input_name in forward_inputs_position_map.keys(): + tensor_wrapper_type = forward_inputs_position_map[ + backward_input_name][0] + self.backward_forward_inputs_map[backward_input_name] = [ + backward_input_type, True, backward_input_pos + ] + + elif backward_input_name in forward_outputs_position_map.keys(): + tensor_wrapper_type = forward_outputs_position_map[ + backward_input_name][0] + self.backward_forward_inputs_map[backward_input_name] = [ + backward_input_type, False, backward_input_pos + ] + else: + assert False, backward_input_name + + for backward_output in backward_returns_list: + backward_output_name = backward_output[0] + backward_output_type = backward_output[1] + backward_output_pos = backward_output[2] + + backward_fwd_name = FindForwardName(backward_output_name) + assert backward_fwd_name is not None + assert backward_fwd_name in forward_inputs_position_map.keys( + ), f"Unable to find 
{backward_fwd_name} in forward inputs" + + matched_forward_input_type = forward_inputs_position_map[ + backward_fwd_name][0] + matched_forward_input_pos = forward_inputs_position_map[ + backward_fwd_name][1] + + self.backward_grad_outputs_map[backward_output_name] = [ + backward_output_type, matched_forward_input_pos, + backward_output_pos + ] + print("Generated Backward Fwd Input Map: ", + self.backward_forward_inputs_map) + print("Generated Backward Grad Input Map: ", + self.backward_grad_inputs_map) + print("Generated Backward Grad Output Map: ", + self.backward_grad_outputs_map) + + def GenerateNodeDeclaration(self): + forward_op_name = self.forward_api_name + backward_forward_inputs_map = self.backward_forward_inputs_map + backward_attrs_list = self.backward_attrs_list + no_need_buffers = self.no_need_buffers + + # SetTensorWrapper Methods & TensorWrapper Members + set_tensor_wrapper_methods_str = "" + tensor_wrapper_members_str = "" + clear_tensor_wrapper_str = "" + for tname, (ttype, is_fwd_input, + _) in backward_forward_inputs_map.items(): + no_need_buffer = "true" if tname in no_need_buffers else "false" + tensor_wrapper_name = GetSavedName(tname) + if IsPlainTensorType(ttype): + set_tensor_wrapper_methods_str += SET_PLAIN_TENSOR_WRAPPER_TEMPLATE.format( + tname, tname, tensor_wrapper_name, tname, no_need_buffer) + + tensor_wrapper_members_str += PLAIN_TENSOR_MEMBER_TEMPLATE.format( + tensor_wrapper_name) + + clear_tensor_wrapper_str += CLEAR_TENSOR_WRAPPER_TEMPLATE.format( + tensor_wrapper_name) + + else: + assert IsVectorTensorType(ttype) + set_tensor_wrapper_methods_str += SET_VECTOR_TENSOR_WRAPPER_TEMPLATE.format( + tname, tname, tname, tensor_wrapper_name, no_need_buffer) + + tensor_wrapper_members_str += VECTOR_TENSOR_MEMBER_TEMPLATE.format( + tensor_wrapper_name) + + clear_tensor_wrapper_str += CLEAR_VECTOR_TENSOR_WRAPPERS_TEMPLATE.format( + tensor_wrapper_name) + + # SetAttributes & Attribute Members + set_attribute_methods_str = "" + attribute_members_str = "" + for aname, atype, default_val, _ in backward_attrs_list: + saved_attr_name = GetSavedName(aname) + set_attribute_methods_str += SET_ATTR_METHOD_TEMPLATE.format( + aname, GetConstReference(atype), aname, saved_attr_name, aname) + + if default_val: + attribute_members_str += ATTRIBUTE_MEMBER_WITH_DEFAULT_TEMPLATE.format( + RemoveConstAndReference(atype), saved_attr_name, + default_val) + else: + attribute_members_str += ATTRIBUTE_MEMBER_TEMPLATE.format( + RemoveConstAndReference(atype), saved_attr_name) + + grad_node_name = GetGradNodeName(forward_op_name) + self.node_declaration_str = NODE_DECLARATION_TEMPLATE.format( + grad_node_name, grad_node_name, grad_node_name, grad_node_name, + grad_node_name, clear_tensor_wrapper_str, + set_tensor_wrapper_methods_str, set_attribute_methods_str, + tensor_wrapper_members_str, attribute_members_str) + + print("Generated Node Declaration: ", self.node_declaration_str) + + def GenerateNodeDefinition(self): + namespace = self.namespace + forward_api_name = self.forward_api_name + backward_api_name = self.backward_api_name + backward_forward_inputs_map = self.backward_forward_inputs_map + backward_grad_inputs_map = self.backward_grad_inputs_map + backward_grad_outputs_map = self.backward_grad_outputs_map + backward_attrs_list = self.backward_attrs_list + + # Construct grad_api function args + # Order: TensorWrappers, GradTensors, Attributes + grad_api_args_len = len(backward_forward_inputs_map.keys()) + len( + backward_grad_inputs_map.keys()) + len(backward_attrs_list) + 
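At this point in `GenerateNodeDefinition` the grad api arguments are laid out positionally following the "TensorWrappers, GradTensors, Attributes" convention stated above; a hypothetical summary of that layout for a made-up `matmul_grad`:

    #   grad_api_args[0..]   -> recovered tensor wrappers (saved forward inputs/outputs)
    #   then                 -> hooked_grads[fwd_position] entries (incoming grad tensors)
    #   last                 -> saved attributes, passed as this->attr_name_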
grad_api_args = ["" for i in range(grad_api_args_len)] + for name, (_, is_fwd_input, + grad_api_position), in backward_forward_inputs_map.items(): + tensor_wrapper_name = GetSavedName(name) + grad_api_args[ + grad_api_position] = f"egr::EagerUtils::RecoverTensorWrapper(&this->{tensor_wrapper_name}, nullptr)" + + for _, (ttype, fwd_position, + grad_api_position) in backward_grad_inputs_map.items(): + if IsPlainTensorType(ttype): + grad_api_args[ + grad_api_position] = f"hooked_grads[{fwd_position}][0]" + else: + assert IsVectorTensorType(ttype) + grad_api_args[ + grad_api_position] = f"hooked_grads[{fwd_position}]" + + for name, _, _, grad_api_position in backward_attrs_list: + saved_attribute_name = GetSavedName(name) + grad_api_args[grad_api_position] = f"this->{saved_attribute_name}" + grad_api_args_str = ", ".join(grad_api_args) + + # Construct grad_api returns + num_bwd_outputs = len(backward_grad_outputs_map.keys()) + returns_str = f"std::vector> returns({num_bwd_outputs});\n" + for _, (ttype, fwd_position, + grad_api_position) in backward_grad_outputs_map.items(): + # Infer Grad API Return Type + if num_bwd_outputs == 1: + # Single tensor output, return as is + if IsPlainTensorType(ttype): + returns_str += "returns[0] = { grad_api_returns };\n" + else: + assert IsVectorTensorType(ttype) + returns_str += "returns[0] = grad_api_returns;\n" + else: + # Rearrange output order accordingly + returns_str += f"returns[{fwd_position}] = grad_api_returns[{grad_api_position}];\n" + returns_str += f"if(NeedComplexToRealConversion()) HandleComplexGradToRealGrad(&returns);\n" + returns_str += f"return returns;\n" + + grad_node_name = GetGradNodeName(forward_api_name) + + fill_zero_str = "" + if forward_api_name in ops_to_fill_zero_for_empty_grads: + fill_zero_str = "egr::EagerUtils::FillZeroForEmptyGradInputs(&grads, this->InputMeta());\n" + + grad_api_namespace = f"paddle::experimental::{namespace}" + + self.node_definition_str = FUNCTION_TEMPLATE.format( + grad_node_name, fill_zero_str, grad_node_name, grad_api_namespace, + backward_api_name, grad_api_args_str, returns_str) + + print("Generated Node Definition: ", self.node_definition_str) + + def GenerateForwardDefinition(self, is_inplaced): + namespace = self.namespace + forward_api_name = GetInplacedFunctionName( + self.forward_api_name) if is_inplaced else self.forward_api_name + backward_api_name = self.backward_api_name + forward_inputs_position_map = self.forward_inputs_position_map + forward_outputs_position_map = self.forward_outputs_position_map + forward_attrs_list = self.forward_attrs_list + backward_forward_inputs_map = self.backward_forward_inputs_map + backward_grad_inputs_map = self.backward_grad_inputs_map + backward_grad_outputs_map = self.backward_grad_outputs_map + backward_attrs_list = self.backward_attrs_list + optional_inputs = self.optional_inputs + intermediate_outputs = self.intermediate_outputs + inplace_map = self.inplace_map + + # Get Function Args + num_inputs = len(forward_attrs_list) + len( + forward_inputs_position_map.keys()) + inputs_args_definition_list = ["" for i in range(num_inputs)] + inputs_args_declaration_list = ["" for i in range(num_inputs)] + inputs_call_list = ["" for i in range(num_inputs)] + for name, (ttype, pos) in forward_inputs_position_map.items(): + inputs_call_list[pos] = f"{name}" + is_optional = (name in optional_inputs) + if IsPlainTensorType(ttype): + if is_optional: + arg_str = f"const paddle::optional& {name}" + else: + if inplace_map and name in inplace_map.keys(): + arg_str = 
f"paddle::experimental::Tensor& {name}" + else: + arg_str = f"const paddle::experimental::Tensor& {name}" + else: + assert IsVectorTensorType(ttype) + arg_str = f"const std::vector& {name}" + + inputs_args_definition_list[pos] = arg_str + inputs_args_declaration_list[pos] = arg_str + + for name, atype, default_val, pos in forward_attrs_list: + inputs_call_list[pos] = name + if default_val is not None: + inputs_args_declaration_list[ + pos] = f"{atype} {name} = {default_val}" + else: + inputs_args_declaration_list[pos] = f"{atype} {name}" + inputs_args_definition_list[pos] = f"{atype} {name}" + + inputs_args_declaration_str = ", ".join(inputs_args_declaration_list) + inputs_args_definition_str = ", ".join(inputs_args_definition_list) + inputs_call_args_str = ", ".join(inputs_call_list) + + # Forward Full Logic + function_name = forward_api_name + if len(intermediate_outputs) > 0: + function_name = GetIntermediateAPIFunctionName(function_name) + + forward_call_str = f"auto api_result = paddle::experimental::{namespace}{function_name}({inputs_call_args_str});" + + # Get return type list & outputs + num_outputs = len(forward_outputs_position_map.keys()) - len( + intermediate_outputs) + returns_type_list = ["" for i in range(num_outputs)] + returns_list = ["" for i in range(num_outputs)] + for name, (rtype, pos) in forward_outputs_position_map.items(): + if name in intermediate_outputs: + continue + if num_outputs == 1: + returns_list[0] = f"api_result" + else: + # Tuple api_result + returns_list[pos] = f"std::get<{pos}>(api_result)" + + if IsPlainTensorType(rtype): + returns_type_list[pos] = "paddle::experimental::Tensor" + else: + assert IsVectorTensorType(rtype) + returns_type_list[ + pos] = "std::vector" + + if num_outputs == 1: + returns_str = returns_list[0] + returns_type_str = returns_type_list[0] + else: + returns_type_str = ", ".join(returns_type_list) + returns_type_str = f"std::tuple<{returns_type_str}>" + returns_str = ", ".join(returns_list) + returns_str = f"std::make_tuple({returns_str})" + + self.GenerateNodeCreationCodes(forward_call_str) + + node_creation_str = self.node_creation_str + dygraph_event_str = f"paddle::platform::RecordEvent dygraph_entrance_record_event(\"{forward_api_name} dygraph\", paddle::platform::TracerEventType::Operator, 1);" + forward_function_name = GetDygraphForwardFunctionName(forward_api_name) + + self.forward_definition_str += FORWARD_FUNCTION_TEMPLATE.format( + returns_type_str, forward_function_name, inputs_args_definition_str, + dygraph_event_str, node_creation_str, returns_str) + self.forward_declaration_str += f"{returns_type_str} {forward_function_name}({inputs_args_declaration_str});\n" + + print("Generated Forward Definition: ", self.forward_definition_str) + print("Generated Forward Declaration: ", self.forward_declaration_str) + + def GenerateNodeCreationCodes(self, forward_call_str): + forward_api_name = self.forward_api_name + forward_inputs_position_map = self.forward_inputs_position_map + forward_outputs_position_map = self.forward_outputs_position_map + forward_attrs_list = self.forward_attrs_list + backward_forward_inputs_map = self.backward_forward_inputs_map + backward_grad_inputs_map = self.backward_grad_inputs_map + backward_grad_outputs_map = self.backward_grad_outputs_map + backward_attrs_list = self.backward_attrs_list + optional_inputs = self.optional_inputs + inplace_map = self.inplace_map + + # Get Input AutoGradMeta + inputs_autograd_meta_list = [] + compute_require_grad_args_list = ["trace_backward"] + for name, (ttype, 
pos) in forward_inputs_position_map.items(): + input_autograd_meta_name = GetAutoGradMetaName(name) + if IsPlainTensorType(ttype): + input_autograd_meta = f" egr::AutogradMeta* {input_autograd_meta_name} = egr::EagerUtils::nullable_autograd_meta({name});" + else: + assert IsVectorTensorType(ttype) + input_autograd_meta_vec_name = GetAutoGradMetaVectorName(name) + input_autograd_meta = f" std::vector {input_autograd_meta_vec_name} = egr::EagerUtils::nullable_autograd_meta({name});\n" + input_autograd_meta += f" std::vector* {input_autograd_meta_name} = &{input_autograd_meta_vec_name};" + + inputs_autograd_meta_list.append(input_autograd_meta) + compute_require_grad_args_list.append(input_autograd_meta_name) + inputs_autograd_meta_str = "\n".join(inputs_autograd_meta_list) + compute_require_grad_args_str = ",".join(compute_require_grad_args_list) + + # Get Output AutoGradMeta + outputs_autograd_meta_list = [] + pass_stop_gradient_args_list = ["false"] + num_fwd_outputs = len(forward_outputs_position_map.keys()) + for name, (rtype, pos) in forward_outputs_position_map.items(): + output_autograd_meta_name = GetAutoGradMetaName(name) + output_autograd_meta_vec_name = GetAutoGradMetaVectorName(name) + if num_fwd_outputs == 1: + if IsPlainTensorType(rtype): + output_autograd_meta = f" egr::AutogradMeta* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(&api_result);" + else: + assert IsVectorTensorType(rtype) + output_autograd_meta = f" std::vector {output_autograd_meta_vec_name} = egr::EagerUtils::autograd_meta(&api_result);\n" + output_autograd_meta += f" std::vector* {output_autograd_meta_name} = &{output_autograd_meta_vec_name};" + else: + # Tuple api_result + if IsPlainTensorType(rtype): + output_autograd_meta = f" egr::AutogradMeta* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(&std::get<{pos}>(api_result));" + else: + assert IsVectorTensorType(rtype) + output_autograd_meta = f" std::vector {output_autograd_meta_vec_name} = egr::EagerUtils::autograd_meta(&std::get<{pos}>(api_result));\n" + output_autograd_meta += f" std::vector* {output_autograd_meta_name} = &{output_autograd_meta_vec_name};" + + outputs_autograd_meta_list.append(output_autograd_meta) + pass_stop_gradient_args_list.append(output_autograd_meta_name) + + # ComputeRequireGrad & PassStopGradient + outputs_autograd_meta_str = "\n".join(outputs_autograd_meta_list) + pass_stop_gradient_args_str = ",".join(pass_stop_gradient_args_list) + + # Check Inplace + check_inplace_str = "" + bump_inplace_version_str = "" + for inplace_name in inplace_map.keys(): + inplace_autograd_meta_name = GetAutoGradMetaName(inplace_name) + check_inplace_str += CHECK_INPLACE_TEMPLATE.format( + inplace_name, inplace_autograd_meta_name) + bump_inplace_version_str += BUMP_INPLACE_VERSION_TEMPLATE.format( + inplace_name, inplace_name) + + # Node Construction + num_backward_inputs = len(backward_grad_inputs_map.keys()) + num_backward_outputs = len(backward_grad_outputs_map.keys()) + grad_node_name = GetGradNodeName(forward_api_name) + + node_construction_str = f" auto grad_node = std::make_shared<{grad_node_name}>({num_backward_inputs}, {num_backward_outputs});" + + # SetAttributes + set_attributes_list = [] + forward_attrs_name_set = set() + for name, _, _, _ in forward_attrs_list: + forward_attrs_name_set.add(name) + + for name, _, default_val_attr, _ in backward_attrs_list: + if name in forward_attrs_name_set: + set_attributes = f" grad_node->SetAttribute{name}({name});" + else: + set_attributes = f" 
grad_node->SetAttribute{name}({default_val_attr});" + set_attributes_list.append(set_attributes) + set_attributes_str = "\n".join(set_attributes_list) + + # SetTensorWrappers + set_tensor_wrappers_list = [] + for name, (atype, is_fwd_input, + pos) in backward_forward_inputs_map.items(): + is_optional = (name in optional_inputs) + + if is_fwd_input: + if is_optional: + set_tensor_wrappers = f" if({name}.is_initialized()) grad_node->SetTensorWrapper{name}({name}, true);" + else: + set_tensor_wrappers = f" grad_node->SetTensorWrapper{name}({name}, true);" + else: + if num_fwd_outputs > 1: + # Aligned with forward output position + assert name in forward_outputs_position_map.keys() + fwd_output_pos = forward_outputs_position_map[name][1] + tw_name = f"std::get<{fwd_output_pos}>(api_result)" + else: + tw_name = f"api_result" + + if is_optional: + set_tensor_wrappers = f" if({tw_name}.is_initialized()) grad_node->SetTensorWrapper{name}({tw_name}, false);" + else: + set_tensor_wrappers = f" grad_node->SetTensorWrapper{name}({tw_name}, false);" + set_tensor_wrappers_list.append(set_tensor_wrappers) + set_tensor_wrappers_str = "\n".join(set_tensor_wrappers_list) + + # SetGradOutMeta & SetEdges + set_grad_out_meta_list = [] + set_edges_list = [] + for name, (_, pos) in forward_inputs_position_map.items(): + input_autograd_meta_name = GetAutoGradMetaName(name) + set_grad_out_meta = f" grad_node->SetGradOutMeta({name}, {pos});" + set_edges = f" grad_node->AddEdges({input_autograd_meta_name}, {pos});" + set_grad_out_meta_list.append(set_grad_out_meta) + set_edges_list.append(set_edges) + set_grad_out_meta_str = "\n".join(set_grad_out_meta_list) + set_edges_str = "\n".join(set_edges_list) + + # SetOutRank & SetHistory & SetGradInMeta + set_out_rank_list = [] + set_history_list = [] + set_grad_in_meta_list = [] + set_retain_grad_list = [] + num_outputs = len(forward_outputs_position_map.keys()) + for name, (_, pos) in forward_outputs_position_map.items(): + output_autograd_meta_name = GetAutoGradMetaName(name) + set_out_rank = f" egr::EagerUtils::SetOutRankWithSlot({output_autograd_meta_name}, {pos});" + set_history = f" egr::EagerUtils::SetHistory({output_autograd_meta_name}, grad_node);" + + if num_outputs == 1: + set_retain_grad = f" egr::EagerUtils::CheckAndRetainGrad(api_result);" + set_grad_in_meta = f" grad_node->SetGradInMeta(api_result, {pos});" + else: + set_retain_grad = f" egr::EagerUtils::CheckAndRetainGrad(std::get<{pos}>(api_result));" + set_grad_in_meta = f" grad_node->SetGradInMeta(std::get<{pos}>(api_result), {pos});" + set_out_rank_list.append(set_out_rank) + set_history_list.append(set_history) + set_grad_in_meta_list.append(set_grad_in_meta) + set_retain_grad_list.append(set_retain_grad) + + set_out_rank_str = "\n".join(set_out_rank_list) + set_history_str = "\n".join(set_history_list) + set_grad_in_meta_str = "\n".join(set_grad_in_meta_list) + set_retain_grad_str = "\n".join(set_retain_grad_list) + + node_event_name = forward_api_name + " node_creation" + node_creation_event_str = f"paddle::platform::RecordEvent node_creation_record_event(\"{node_event_name}\", paddle::platform::TracerEventType::Operator, 1);\n" + + self.node_creation_str = NODE_CREATION_TEMPLATE.format( + inputs_autograd_meta_str, compute_require_grad_args_str, + check_inplace_str, forward_call_str, bump_inplace_version_str, + node_creation_event_str, outputs_autograd_meta_str, + pass_stop_gradient_args_str, node_construction_str, + set_attributes_str, set_tensor_wrappers_str, set_grad_out_meta_str, + 
set_edges_str, set_out_rank_str, set_history_str, + set_grad_in_meta_str, set_retain_grad_str) + + def GenerateInplacedForwardDygraphFunctions(self): + # Inplaced Version Dygraph Function Generation + forward_api_name = self.forward_api_name + forward_api_contents = self.forward_api_contents + + if forward_api_name != "sum" and "inplace" in forward_api_contents.keys( + ): + # Node Definition Generation + self.GenerateForwardDefinition(is_inplaced=True) + self.UpdateCoreOpsInformation(is_inplaced=True) + + def UpdateCoreOpsInformation(self, is_inplaced): + forward_api_name = GetInplacedFunctionName( + self.forward_api_name) if is_inplaced else self.forward_api_name + forward_inputs_position_map = self.forward_inputs_position_map + forward_outputs_position_map = self.forward_outputs_position_map + forward_attrs_list = self.forward_attrs_list + + num_args = len(forward_inputs_position_map.keys()) + len( + forward_attrs_list) + num_returns = len(forward_outputs_position_map.keys()) + + final_state_fwd_api_name = "final_state_" + forward_api_name + core_ops_returns_info[ + final_state_fwd_api_name] = ["" for i in range(num_returns)] + core_ops_args_info[ + final_state_fwd_api_name] = ["" for i in range(num_args)] + core_ops_args_type_info[ + final_state_fwd_api_name] = ["" for i in range(num_args)] + for name, (ttype, pos) in forward_inputs_position_map.items(): + core_ops_args_info[final_state_fwd_api_name][pos] = name + if IsPlainTensorType(ttype): + core_ops_args_type_info[final_state_fwd_api_name][ + pos] = "tensor" + else: + assert IsVectorTensorType(ttype) + core_ops_args_type_info[final_state_fwd_api_name][pos] = "list" + + for name, _, _, pos in forward_attrs_list: + core_ops_args_info[final_state_fwd_api_name][pos] = name + + for name, (ttype, pos) in forward_outputs_position_map.items(): + core_ops_returns_info[final_state_fwd_api_name][pos] = name + + def run(self): + # Basic Validation Check + self.DygraphYamlValidationCheck() + + ########################## + ## Parsing Raw Contents ## + ########################## + # Parse inplace_map + self.ParseInplaceInfo() + + # Parse no_need_buffer + self.ParseNoNeedBuffer() + + # Parse optional_inputs + self.ParseDispensable() + + # Parse intermediate_outputs + self.ParseIntermediate() + self.IntermediateValidationCheck() + + # Initialize backward_forward_str, backward_inputs_list, backward_attrs_list, backward_returns_list + self.CollectBackwardInfo() + + # Initialize forward_inputs_list, forward_attrs_list, forward_returns_list + self.CollectForwardInfoFromBackwardContents() + + # Initialize orig_forward_inputs_list, orig_forward_attrs_list, orig_forward_returns_list + self.CollectOriginalForwardInfo() + + # Forwards Validation Check + self.ForwardsValidationCheck() + + ############################# + ## Process Parsed Contents ## + ############################# + # Initialize forward_inputs_position_map, forward_outputs_position_map + self.DetermineForwardPositionMap(self.forward_inputs_list, + self.forward_returns_list) + + # Initialize forward_inputs_position_map, forward_outputs_position_map + self.SlotNameMatching() + + # Backward Validation Check + self.BackwardValidationCheck() + + ##################### + ## Code Generation ## + ##################### + self.GenerateNodeDeclaration() + self.GenerateNodeDefinition() + self.GenerateForwardDefinition(is_inplaced=False) + + self.UpdateCoreOpsInformation(is_inplaced=False) + + self.GenerateInplacedForwardDygraphFunctions() + + +class DygraphYamlGenerator(YamlGeneratorBase): + def 
__init__(self, api_yaml_path, backward_yaml_path): + # Parent members: + # self.namespace + # self.api_yaml_path + # self.forward_api_list + YamlGeneratorBase.__init__(self, api_yaml_path) + + self.backward_yaml_path = backward_yaml_path + self.grad_api_dict = {} + + self.forward_definition_str = "" + self.forward_declaration_str = "" + self.node_declaration_str = "" + self.node_definition_str = "" + + def ParseYamlContents(self): + self.ParseForwardYamlContents() + + backward_yaml_path = self.backward_yaml_path + self.grad_api_dict = ReadBwdFile(backward_yaml_path) + + def GetBackwardAPIContents(self, forward_api_contents): + grad_api_dict = self.grad_api_dict + + if 'backward' not in forward_api_contents.keys(): return None + + backward_api_name = forward_api_contents['backward'] + assert backward_api_name in grad_api_dict.keys() + backward_api_contents = grad_api_dict[backward_api_name] + + return backward_api_contents + + def GenerateCode(self): + forward_api_list = self.forward_api_list + grad_api_dict = self.grad_api_dict + namespace = self.namespace + + for forward_api_contents in forward_api_list: + backward_api_contents = self.GetBackwardAPIContents( + forward_api_contents) + if backward_api_contents is None: continue + + d_generator = DygraphSingleFunctionGenerator( + forward_api_contents, backward_api_contents, namespace) + d_generator.run() + + self.forward_definition_str += d_generator.forward_definition_str + "\n" + self.forward_declaration_str += d_generator.forward_declaration_str + "\n" + self.node_declaration_str += d_generator.node_declaration_str + "\n" + self.node_definition_str += d_generator.node_definition_str + "\n" + + if len(namespace) > 0: + if namespace.endswith("::"): + namespace = namespace[:-2] + self.forward_definition_str = NAMESPACE_WRAPPER_TEMPLATE.format( + namespace, self.forward_definition_str) + self.forward_declaration_str = NAMESPACE_WRAPPER_TEMPLATE.format( + namespace, self.forward_declaration_str) + self.node_declaration_str = NAMESPACE_WRAPPER_TEMPLATE.format( + namespace, self.node_declaration_str) + self.node_definition_str = NAMESPACE_WRAPPER_TEMPLATE.format( + namespace, self.node_definition_str) + + def run(self): + self.ParseYamlContents() + + self.InferNameSpace() + + self.GenerateCode() + + +################## +## File Writers ## +################## def GenerateNodeCCFile(filepath, node_definition_str): - file_contents = """ -#include "glog/logging.h" -#include "paddle/phi/api/all.h" -#include "paddle/phi/api/backward/backward_api.h" -#include "paddle/fluid/imperative/tracer.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/eager/utils.h" -#include "paddle/fluid/eager/api/utils/global_utils.h" -#include "paddle/fluid/eager/api/generated/eager_generated/backwards/nodes.h" -#include "paddle/fluid/eager/to_static/run_program_op_node.h" + if os.path.exists(filepath): + os.remove(filepath) -#include "paddle/phi/api/backward/sparse_bw_api.h" -""" - file_contents += node_definition_str + file_contents = NODE_CC_FILE_TEMPLATE.format(node_definition_str) with open(filepath, 'a') as f: f.write(file_contents) def GenerateNodeHFile(filepath, node_declaration_str): - file_contents = """ -#pragma once -#include "paddle/fluid/eager/tensor_wrapper.h" -#include "paddle/fluid/eager/grad_node_info.h" + if os.path.exists(filepath): + os.remove(filepath) -""" - file_contents += node_declaration_str + file_contents = NODE_H_FILE_TEMPLATE.format(node_declaration_str) with open(filepath, 'a') as f: f.write(file_contents) def 
GenerateForwardCCFile(filepath, forward_definition_str): - file_contents = """ -#include "paddle/phi/api/lib/dygraph_api.h" -#include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" -#include "paddle/fluid/eager/api/generated/eager_generated/backwards/nodes.h" - -#include "paddle/phi/api/include/sparse_api.h" -#include "paddle/fluid/eager/api/utils/global_utils.h" -#include "paddle/fluid/platform/profiler/event_tracing.h" - -""" + if os.path.exists(filepath): + os.remove(filepath) - file_contents += GenerateCoreOpInfoDefinition() - file_contents += forward_definition_str + core_ops_info_str = GenerateCoreOpInfoDefinition() + file_contents = FORWARD_CC_FILE_TEMPLATE.format(core_ops_info_str, + forward_definition_str) with open(filepath, 'a') as f: f.write(file_contents) def GenerateForwardHFile(filepath, forward_function_declaration_str): - file_contents = """ -#pragma once -#include "glog/logging.h" -#include "paddle/fluid/eager/autograd_meta.h" -#include "paddle/phi/api/all.h" -#include "paddle/fluid/eager/utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/eager/to_static/run_program_op_func.h" + if os.path.exists(filepath): + os.remove(filepath) -""" - file_contents += GenerateCoreOpInfoDeclaration() - file_contents += forward_function_declaration_str + core_ops_info_str = GenerateCoreOpInfoDeclaration() + file_contents = FORWARD_H_FILE_TEMPLATE.format( + core_ops_info_str, forward_function_declaration_str) with open(filepath, 'a') as f: f.write(file_contents) @@ -1224,199 +1217,13 @@ if __name__ == "__main__": api_yaml_path = api_yaml_paths[i] backward_yaml_path = backward_yaml_paths[i] - if "sparse" in api_yaml_path: - assert "sparse" in backward_yaml_path - namespace = "sparse" - else: - namespace = "" - - fwd_api_list = ReadFwdFile(api_yaml_path) - grad_api_dict = ReadBwdFile(backward_yaml_path) - - yaml_forward_definition_str = "" - yaml_forward_declaration_str = "" - yaml_node_declaration_str = "" - yaml_node_definition_str = "" - for fwd_api in fwd_api_list: - # We only generate Ops with grad - if 'backward' not in fwd_api.keys(): - continue + generator = DygraphYamlGenerator(api_yaml_path, backward_yaml_path) + generator.run() - assert 'api' in fwd_api.keys() - assert 'args' in fwd_api.keys() - assert 'output' in fwd_api.keys() - assert 'backward' in fwd_api.keys() - - no_need_buffer_set = set() - if 'no_need_buffer' in fwd_api.keys(): - no_need_buffer_set = ParseNoNeedBuffer(fwd_api[ - 'no_need_buffer']) - - fwd_api_name = fwd_api['api'] - fwd_args_str = fwd_api['args'] - fwd_returns_str = fwd_api['output'] - - inplace_map = {} - if 'inplace' in fwd_api.keys(): - inplace_map = ParseInplaceInfo(fwd_api['inplace']) - - bwd_api_name = fwd_api['backward'] - assert bwd_api_name in grad_api_dict.keys(), bwd_api_name - bwd_api = grad_api_dict[bwd_api_name] - - assert 'args' in bwd_api.keys() - assert 'output' in bwd_api.keys() - assert 'forward' in bwd_api.keys() - - # Parse Dispensable Inputs - optional_inputs = [] - if 'optional' in fwd_api.keys(): - optional_inputs = ParseDispensable(fwd_api['optional']) - - bwd_forward_str = bwd_api['forward'] - bwd_args_str = bwd_api['args'] - bwd_returns_str = bwd_api['output'] - - # Collect Forward Inputs/Outputs - forward_inputs_list, forward_attrs_list, forward_returns_list = ParseYamlForwardFromBackward( - bwd_forward_str) - print("Parsed Forward Inputs List: ", forward_inputs_list) - print("Prased Forward Attrs List: ", forward_attrs_list) - print("Parsed Forward Returns List: ", 
forward_returns_list) - - intermediate_outputs = [] - if 'intermediate' in fwd_api.keys(): - intermediate_outputs = ParseIntermediate(fwd_api[ - 'intermediate']) - - IntermediateValidationCheck(intermediate_outputs, - forward_returns_list) - - # Collect Original Forward Inputs/Outputs and then perform validation checks - orig_forward_inputs_list, orig_forward_attrs_list, orig_forward_returns_list = ParseYamlForward( - fwd_args_str, fwd_returns_str) - print("Parsed Original Forward Inputs List: ", - orig_forward_inputs_list) - print("Prased Original Forward Attrs List: ", - orig_forward_attrs_list) - print("Parsed Original Forward Returns List: ", - orig_forward_returns_list) - - # Forward Validation Checks - ForwardsValidationCheck( - forward_inputs_list, forward_attrs_list, forward_returns_list, - orig_forward_inputs_list, orig_forward_attrs_list, - orig_forward_returns_list) - - # Parse Backward Inputs/Outputs - backward_inputs_list, backward_attrs_list, backward_returns_list = ParseYamlBackward( - bwd_args_str, bwd_returns_str) - print("Parsed Backward Inputs List: ", backward_inputs_list) - print("Prased Backward Attrs List: ", backward_attrs_list) - print("Parsed Backward Returns List: ", backward_returns_list) - - # Determine Forward Inputs/Outputs Position - forward_inputs_position_map, forward_outputs_position_map = DetermineForwardPositionMap( - forward_inputs_list, forward_returns_list) - print("Generated Forward Input Position Map: ", - forward_inputs_position_map) - print("Generated Forward Output Position Map: ", - forward_outputs_position_map) - - # SlotName Matching - backward_fwd_input_map, backward_grad_input_map, backward_grad_output_map = SlotNameMatching( - backward_inputs_list, backward_returns_list, - forward_inputs_position_map, forward_outputs_position_map) - print("Generated Backward Fwd Input Map: ", backward_fwd_input_map) - print("Generated Backward Grad Input Map: ", - backward_grad_input_map) - print("Generated Backward Grad Output Map: ", - backward_grad_output_map) - - # Backward Validation Check - BackwardValidationCheck(backward_fwd_input_map, - backward_grad_input_map, - backward_attrs_list) - - # Node Declaration Generation - yaml_node_declaration_str += GenerateNodeDeclaration( - fwd_api_name, backward_fwd_input_map, backward_attrs_list, - no_need_buffer_set) - print("Generated Node Declaration: ", node_declaration_str) - - yaml_node_definition_str += GenerateNodeDefinition( - fwd_api_name, bwd_api_name, backward_fwd_input_map, - backward_grad_input_map, backward_grad_output_map, - backward_attrs_list) - print("Generated Node Definition: ", node_definition_str) - - # Node Definition Generation - definition_declaration_pair = GenerateForwardDefinition( - fwd_api_name, bwd_api_name, forward_inputs_position_map, - forward_outputs_position_map, orig_forward_attrs_list, - backward_fwd_input_map, backward_grad_input_map, - backward_grad_output_map, backward_attrs_list, optional_inputs, - intermediate_outputs, {}) - print("Generated Forward Definition: ", forward_definition_str) - print("Generated Forward Declaration: ", forward_declaration_str) - yaml_forward_definition_str += definition_declaration_pair[0] - yaml_forward_declaration_str += definition_declaration_pair[1] - - # For python-level API dispatch - CollectCoreOpsInformation(fwd_api_name, forward_inputs_position_map, - forward_outputs_position_map, - orig_forward_attrs_list) - - # Inplaced Version Dygraph Function Generation - if fwd_api_name != "sum" and "inplace" in fwd_api.keys(): - 
fwd_api_name_inplaced = GetInplacedFunctionName(fwd_api_name) - - # Node Definition Generation - definition_declaration_pair = GenerateForwardDefinition( - fwd_api_name_inplaced, bwd_api_name, - forward_inputs_position_map, forward_outputs_position_map, - forward_attrs_list, backward_fwd_input_map, - backward_grad_input_map, backward_grad_output_map, - backward_attrs_list, optional_inputs, intermediate_outputs, - inplace_map) - print("Generated Inplaced Forward Definition: ", - forward_definition_str) - print("Generated Inplaced Forward Declaration: ", - forward_declaration_str) - forward_definition_str += definition_declaration_pair[0] - forward_declaration_str += definition_declaration_pair[1] - - # For python-level API dispatch - CollectCoreOpsInformation( - fwd_api_name_inplaced, forward_inputs_position_map, - forward_outputs_position_map, forward_attrs_list) - - if len(namespace) > 0: - forward_definition_str += f"""namespace {namespace} {{ - {yaml_forward_definition_str} -}} -""" - - forward_declaration_str += f"""namespace {namespace} {{ - {yaml_forward_declaration_str} -}} -""" - - node_declaration_str += f"""namespace {namespace} {{ - {yaml_node_declaration_str} -}} -""" - - node_definition_str += f"""namespace {namespace} {{ - {yaml_node_definition_str} -}} -""" - - else: - forward_definition_str += yaml_forward_definition_str - forward_declaration_str += yaml_forward_declaration_str - node_declaration_str += yaml_node_declaration_str - node_definition_str += yaml_node_definition_str + node_declaration_str += generator.node_declaration_str + "\n" + node_definition_str += generator.node_definition_str + "\n" + forward_definition_str += generator.forward_definition_str + "\n" + forward_declaration_str += generator.forward_declaration_str + "\n" # Generate Files nodes_h_path = args.nodes_h_path @@ -1424,12 +1231,6 @@ if __name__ == "__main__": forwards_h_path = args.forwards_h_path forwards_cc_path = args.forwards_cc_path - for path in [ - nodes_cc_path, nodes_h_path, forwards_h_path, forwards_cc_path - ]: - if os.path.exists(path): - os.remove(path) - GenerateNodeCCFile(nodes_cc_path, node_definition_str) GenerateNodeHFile(nodes_h_path, node_declaration_str) GenerateForwardCCFile(forwards_cc_path, forward_definition_str) diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py index 5a732212a5649b6899394b80399d3af32707197d..c7be9480f557d94d2c078c3832d996f30c52d730 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py @@ -15,7 +15,10 @@ import os import argparse import logging -from eager_gen import namespace, yaml_types_mapping, ReadFwdFile, ParseDispensable, IsVectorTensorType, GetForwardFunctionName, ParseYamlForward, DetermineForwardPositionMap, GetInplacedFunctionName, ParseInplaceInfo +from codegen_utils import FunctionGeneratorBase, YamlGeneratorBase +from codegen_utils import yaml_types_mapping +from codegen_utils import ReadFwdFile, IsVectorTensorType, GetForwardFunctionName +from codegen_utils import ParseYamlForward, GetInplacedFunctionName ########################### ## Global Configurations ## @@ -121,7 +124,10 @@ FUNCTION_NAME_TEMPLATE = \ PYTHON_C_FUNCTION_REG_TEMPLATE = \ -"{{\"final_state_{}\", (PyCFunction)(void(*)(void)) {}eager_final_state_api_{}, METH_VARARGS | METH_KEYWORDS, \"C++ interface function for {} in dygraph.\"}}" +""" 
+{{\"final_state_{}\", (PyCFunction)(void(*)(void)) {}eager_final_state_api_{}, METH_VARARGS | METH_KEYWORDS, \"C++ interface function for {} in dygraph.\"}} + +""" PYTHON_C_WRAPPER_TEMPLATE = \ @@ -229,77 +235,39 @@ NAMESPACE_WRAPPER_TEMPLATE = \ ####################### ## Generator Classes ## ####################### -class PythonCSingleFunctionGenerator: - def __init__(self, fwd_api_contents, namespace): - self.fwd_api_contents = fwd_api_contents - self.namespace = namespace - - # Raw Contents - self.forward_api_name = "" - self.forward_args_str = "" - self.forward_returns_str = "" - - # Raw Data - self.forward_attrs_list = None #[ [attr_name, attr_type, default_value, orig_position], ...] - self.forward_inputs_list = None #[ [arg_name, arg_type, orig_position], ...] - self.forward_returns_list = None #[ [ret_name, ret_type, orig_position], ...] - - # Processed Data - self.forward_inputs_position_map = None #{ "name" : [type, fwd_position] } - self.forward_outputs_position_map = None #{ "name" : [type, fwd_position] } - - # Special Op Attributes - self.optional_inputs = [] #[name, ...] +class PythonCSingleFunctionGenerator(FunctionGeneratorBase): + def __init__(self, forward_api_contents, namespace): + # Members from Parent: + #self.namespace + #self.forward_api_contents + #self.forward_api_name + #self.orig_forward_inputs_list + #self.orig_forward_attrs_list + #self.orig_forward_returns_list + #self.forward_inputs_position_map + #self.forward_outputs_position_map + #self.optional_inputs + #self.no_need_buffers + #self.intermediate_outputs + #self.inplace_map + FunctionGeneratorBase.__init__(self, forward_api_contents, namespace) + self.is_forward_only = True # Generated Results self.python_c_function_str = "" self.python_c_function_reg_str = "" - def CollectRawContents(self): - fwd_api_contents = self.fwd_api_contents - - assert 'api' in fwd_api_contents.keys( - ), "Unable to find \"api\" in fwd_api_contents keys" - assert 'args' in fwd_api_contents.keys( - ), "Unable to find \"args\" in fwd_api_contents keys" - assert 'output' in fwd_api_contents.keys( - ), "Unable to find \"output\" in fwd_api_contents keys" - - self.forward_api_name = fwd_api_contents['api'] - self.forward_args_str = fwd_api_contents['args'] - self.forward_returns_str = fwd_api_contents['output'] - def CollectIsForwardOnly(self): - fwd_api_contents = self.fwd_api_contents - self.is_forward_only = False if 'backward' in fwd_api_contents.keys( + forward_api_contents = self.forward_api_contents + self.is_forward_only = False if 'backward' in forward_api_contents.keys( ) else True - def CollectOptionalInputs(self): - fwd_api_contents = self.fwd_api_contents - if 'optional' in fwd_api_contents.keys(): - self.optional_inputs = ParseDispensable(fwd_api_contents[ - 'optional']) - - def CollectForwardInOutAttr(self): - forward_args_str = self.forward_args_str - forward_returns_str = self.forward_returns_str - - self.forward_inputs_list, self.forward_attrs_list, self.forward_returns_list = ParseYamlForward( - forward_args_str, forward_returns_str) - - def CollectForwardPositionMap(self): - forward_inputs_list = self.forward_inputs_list - forward_returns_list = self.forward_returns_list - - self.forward_inputs_position_map, self.forward_outputs_position_map = DetermineForwardPositionMap( - forward_inputs_list, forward_returns_list) - - def GeneratePythonCFunction(self, inplace_map): + def GeneratePythonCFunction(self): namespace = self.namespace - forward_api_name = GetInplacedFunctionName( - self.forward_api_name) if 
inplace_map else self.forward_api_name - forward_attrs_list = self.forward_attrs_list + inplace_map = self.inplace_map + forward_api_name = self.forward_api_name + orig_forward_attrs_list = self.orig_forward_attrs_list forward_inputs_position_map = self.forward_inputs_position_map forward_outputs_position_map = self.forward_outputs_position_map optional_inputs = self.optional_inputs @@ -326,7 +294,7 @@ class PythonCSingleFunctionGenerator: parse_attributes_str = "" # Generate Python-C Attributes Parsing Logic - for name, atype, _, pos in forward_attrs_list: + for name, atype, _, pos in orig_forward_attrs_list: parsing_function_name = FindParsingFunctionFromAttributeType(atype) parse_attributes_str += PARSE_PYTHON_C_ARGS_TEMPLATE.format( name, pos, atype, name, parsing_function_name, name, @@ -334,11 +302,11 @@ class PythonCSingleFunctionGenerator: # Generate Dygraph Function Call Logic num_args = len(forward_inputs_position_map.keys()) + len( - forward_attrs_list) + orig_forward_attrs_list) dygraph_function_call_list = ["" for i in range(num_args)] for name, (_, pos) in forward_inputs_position_map.items(): dygraph_function_call_list[pos] = f"{name}" - for name, _, _, pos in forward_attrs_list: + for name, _, _, pos in orig_forward_attrs_list: dygraph_function_call_list[pos] = f"{name}" dygraph_function_call_str = ",".join(dygraph_function_call_list) @@ -350,17 +318,7 @@ class PythonCSingleFunctionGenerator: fwd_function_name = FUNCTION_NAME_TEMPLATE.format( "::", namespace, GetForwardFunctionName(forward_api_name)) - if inplace_map: - assert len( - inplace_map - ) == 1, f"size of inplace_map must be 1, but inplace_map of \"{forward_api_name}\" op got {len(inplace_map)}" - for inplace_input, inplace_output in inplace_map.items(): - return_str = RETURN_INPLACE_PYOBJECT_TEMPLATE.format( - forward_api_name, inplace_input, forward_api_name, - inplace_output) - break - else: - return_str = " return ToPyObject(out);" + return_str = " return ToPyObject(out);" # Generate Record Event for performance profiling pythonc_record_event_str = RECORD_EVENT_TEMPLATE.format( @@ -374,29 +332,56 @@ class PythonCSingleFunctionGenerator: self.python_c_function_reg_str = PYTHON_C_FUNCTION_REG_TEMPLATE.format( forward_api_name, namespace, forward_api_name, forward_api_name) - def run(self, inplace_map): + if len(inplace_map) > 0: + inplaced_forward_api_name = GetInplacedFunctionName( + self.forward_api_name) + assert len( + inplace_map + ) == 1, f"size of inplace_map must be 1, but inplace_map of \"{forward_api_name}\" op got {len(inplace_map)}" + for inplace_input, inplace_output in inplace_map.items(): + return_str = RETURN_INPLACE_PYOBJECT_TEMPLATE.format( + inplaced_forward_api_name, inplace_input, + inplaced_forward_api_name, inplace_output) + break + + self.python_c_function_str += PYTHON_C_FUNCTION_TEMPLATE.format( + inplaced_forward_api_name, pythonc_record_event_str, + inplaced_forward_api_name, get_eager_tensor_str, + parse_attributes_str, fwd_function_name, + dygraph_function_call_str, return_str) + + # Generate Python-C Function Registration + self.python_c_function_reg_str += "\n," + PYTHON_C_FUNCTION_REG_TEMPLATE.format( + inplaced_forward_api_name, namespace, inplaced_forward_api_name, + inplaced_forward_api_name) + + def run(self): # Initialized is_forward_only self.CollectIsForwardOnly() - # Initialized forward_api_name, forward_args_str, forward_returns_str - self.CollectRawContents() - if SkipAPIGeneration(self.forward_api_name): return False - # Initialized optional_inputs - 
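(A brief aside on the position-map bookkeeping used in GeneratePythonCFunction above, as a small self-contained Python sketch; the names and the example signature are illustrative only, not taken from the real api.yaml. Inputs and attributes each carry their original argument position, and the generator fills one flat call list by position before joining it into the dygraph function call.)

def build_call_args(inputs_position_map, attrs_list):
    # inputs_position_map: {name: (type, position)}
    # attrs_list: [(name, type, default_value, position)]
    num_args = len(inputs_position_map) + len(attrs_list)
    call_list = [""] * num_args
    for name, (_, pos) in inputs_position_map.items():
        call_list[pos] = name
    for name, _, _, pos in attrs_list:
        call_list[pos] = name
    return ",".join(call_list)

# e.g. build_call_args({"x": ("Tensor", 0), "y": ("Tensor", 1)},
#                      [("axis", "int", "0", 2)]) returns "x,y,axis"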
self.CollectOptionalInputs() + self.ParseDispensable() + + # Initialized inplace_map + self.ParseInplaceInfo() - # Initialized forward_inputs_list, forward_returns_list, forward_attrs_list - self.CollectForwardInOutAttr() + # Initialized orig_forward_inputs_list, orig_forward_returns_list, orig_forward_attrs_list + self.CollectOriginalForwardInfo() logging.info( - f"Parsed Original Forward Inputs List: \n{self.forward_inputs_list}") + f"Parsed Original Forward Inputs List: \n{self.orig_forward_inputs_list}" + ) logging.info( - f"Prased Original Forward Attrs List: \n{self.forward_attrs_list}") + f"Prased Original Forward Attrs List: \n{self.orig_forward_attrs_list}" + ) logging.info( - f"Parsed Original Forward Returns List: \n{self.forward_returns_list}" + f"Parsed Original Forward Returns List: \n{self.orig_forward_returns_list}" ) + if SkipAPIGeneration(self.forward_api_name): return False + # Initialized forward_inputs_position_map, forward_outputs_position_map - self.CollectForwardPositionMap() + self.DetermineForwardPositionMap(self.orig_forward_inputs_list, + self.orig_forward_returns_list) logging.info( f"Generated Forward Input Position Map: {self.forward_inputs_position_map}" ) @@ -405,7 +390,7 @@ class PythonCSingleFunctionGenerator: ) # Code Generation - self.GeneratePythonCFunction(inplace_map) + self.GeneratePythonCFunction() logging.info( f"Generated Python-C Function: {self.python_c_function_str}") logging.info( @@ -415,21 +400,18 @@ class PythonCSingleFunctionGenerator: return True -class PythonCYamlGenerator: +class PythonCYamlGenerator(YamlGeneratorBase): def __init__(self, path): - self.yaml_path = path - - self.namespace = "" - self.forward_api_list = [] + # Parent members: + # self.namespace + # self.api_yaml_path + # self.forward_api_list + YamlGeneratorBase.__init__(self, api_yaml_path) # Generated Result self.python_c_functions_reg_str = "" self.python_c_functions_str = "" - def ParseYamlContents(self): - yaml_path = self.yaml_path - self.forward_api_list = ReadFwdFile(yaml_path) - def GeneratePythonCFunctions(self): namespace = self.namespace forward_api_list = self.forward_api_list @@ -437,28 +419,12 @@ class PythonCYamlGenerator: for forward_api_content in forward_api_list: f_generator = PythonCSingleFunctionGenerator(forward_api_content, namespace) - status = f_generator.run({}) + status = f_generator.run() if status == True: self.python_c_functions_reg_str += f_generator.python_c_function_reg_str + ",\n" self.python_c_functions_str += f_generator.python_c_function_str + "\n" - if 'inplace' in forward_api_content.keys(): - inplace_map = ParseInplaceInfo(forward_api_content['inplace']) - - f_generator_inplace = PythonCSingleFunctionGenerator( - forward_api_content, namespace) - status = f_generator_inplace.run(inplace_map) - - if status == True: - self.python_c_functions_reg_str += f_generator_inplace.python_c_function_reg_str + ",\n" - self.python_c_functions_str += f_generator_inplace.python_c_function_str + "\n" - - def InferNameSpace(self): - yaml_path = self.yaml_path - if "sparse" in yaml_path: - self.namespace = "sparse::" - def AttachNamespace(self): namespace = self.namespace python_c_functions_str = self.python_c_functions_str @@ -474,7 +440,7 @@ class PythonCYamlGenerator: self.InferNameSpace() # Read Yaml file - self.ParseYamlContents() + self.ParseForwardYamlContents() # Code Generation self.GeneratePythonCFunctions() diff --git a/paddle/fluid/eager/to_static/run_program_op_node.h b/paddle/fluid/eager/to_static/run_program_op_node.h index 
c83e16e9a1ec21b3e7303834ac35b55fed60b2a6..a60d7b5c65ec39484ff11c6bbe869cbf8fcd1975 100644 --- a/paddle/fluid/eager/to_static/run_program_op_node.h +++ b/paddle/fluid/eager/to_static/run_program_op_node.h @@ -51,13 +51,12 @@ static std::vector GetTensorsName( } static void CheckInputVarStatus(const Tensor &tensor) { - PADDLE_ENFORCE_EQ( - tensor.defined() && phi::DenseTensor::classof(tensor.impl().get()), true, - paddle::platform::errors::InvalidArgument( - "The input tensor %s of " - "RunProgram(Grad)Op holds " - "wrong type. Expect type is DenseTensor.", - tensor.name())); + PADDLE_ENFORCE_EQ(tensor.defined() && tensor.is_dense_tensor(), true, + paddle::platform::errors::InvalidArgument( + "The input tensor %s of " + "RunProgram(Grad)Op holds " + "wrong type. Expect type is DenseTensor.", + tensor.name())); PADDLE_ENFORCE_EQ(tensor.initialized(), true, paddle::platform::errors::InvalidArgument( @@ -74,7 +73,7 @@ static void CheckOutputVarStatus(const paddle::framework::Variable &src_var, paddle::platform::errors::InvalidArgument( "dst_tensor shall be defined.")); - if (phi::DenseTensor::classof(dst_tensor.impl().get())) { + if (dst_tensor.is_dense_tensor()) { auto &src_tensor = src_var.Get(); PADDLE_ENFORCE_EQ(phi::DenseTensor::classof(&src_tensor), true, paddle::platform::errors::InvalidArgument( @@ -88,7 +87,7 @@ static void CheckOutputVarStatus(const paddle::framework::Variable &src_var, "RunProgram(Grad)Op's internal " "scope is not initialized.", name)); - } else if (phi::SelectedRows::classof(dst_tensor.impl().get())) { + } else if (dst_tensor.is_selected_rows()) { auto &src_tensor = src_var.Get(); PADDLE_ENFORCE_EQ(phi::SelectedRows::classof(&src_tensor), true, paddle::platform::errors::InvalidArgument( @@ -159,9 +158,6 @@ static void ShareTensorsFromScope( name)); CheckOutputVarStatus(*var, *tensors[i]); // share tensor - // TODO(dev): Determine Tensor type by scope.var - // auto tensor_base = tensors[i]->impl(); - // if (phi::DenseTensor::classof(tensor_base.get())) { if (var->IsType()) { auto &src_tensor = var->Get(); auto *dst_tensor = const_cast( @@ -169,7 +165,6 @@ static void ShareTensorsFromScope( VLOG(2) << "share " << name << " from scope"; *dst_tensor = src_tensor; } else if (var->IsType()) { - // } else if (phi::SelectedRows::classof(tensor_base.get())) { auto &src_tensor = var->Get(); auto *dst_tensor = const_cast( dynamic_cast(tensors[i]->impl().get())); @@ -202,7 +197,6 @@ inline void RunProgramAPI( "The OutScope of RunProgramGradOp should only hold one scope.")); // Step 2. prepare executor and init persistable variables - // NOTE(Aurelius84): While training some models, forward can be called many // times and then apply backpropagation all at once, such as Reinforcement // Learning. Tensor data in multi-step training should be saved into single @@ -277,11 +271,6 @@ inline void RunProgramGradAPI( // if all output vars are set to stop_gradient, grad op no need to executed if (x_grad.empty() && params_grad.empty()) return; - // TODO(dev): Remove this line hard code. And need to deal with the out_grad - // name problem. 
- // const_cast(out_grad[0]) - // .set_name("matmul_v2_0.tmp_0@GRAD"); - auto *global_block = BOOST_GET_CONST(paddle::framework::BlockDesc *, attrs.at("global_block")); auto orig_end_op_index = BOOST_GET_CONST(int64_t, attrs.at("end_op_index")); @@ -381,8 +370,8 @@ class GradNodeRunProgram : public egr::GradNodeBase { VLOG(3) << "out_grads[0].size() : " << grads[0].size(); std::vector x_grad; std::vector params_grad; - ConstructGradTensors(x_, &x_grad); - ConstructGradTensors(params_, ¶ms_grad); + ConstructXGradTensors(x_, &x_grad); + ConstructParamGradTensors(params_, ¶ms_grad); std::vector x_grad_ptr; std::vector params_grad_ptr; for (auto &i : x_grad) { @@ -392,9 +381,6 @@ class GradNodeRunProgram : public egr::GradNodeBase { params_grad_ptr.emplace_back(&i); } - // auto x_grad_ptr = ConstructGradTensors(x_); - // auto params_grad_ptr = ConstructGradTensors(params_); - PADDLE_ENFORCE_EQ( grads[0].size(), fwd_out_names_.size(), paddle::platform::errors::InvalidArgument( @@ -412,7 +398,6 @@ class GradNodeRunProgram : public egr::GradNodeBase { params_grad_ptr); VLOG(3) << "End Eager Backward Node: GradNodeRunProgram"; return {x_grad, params_grad}; - // return {x_grad, details::DereferenceTensors(params_grad_ptr)}; } void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; } @@ -447,29 +432,35 @@ class GradNodeRunProgram : public egr::GradNodeBase { } protected: - void ConstructGradTensors( - const std::vector &fwd_tensors, - std::vector *grad_tensors) { + void ConstructXGradTensors( + const std::vector &x, + std::vector *x_grad) { // TODO(dev): Need an elegant way to determine inforamtion of grad_tensor, // such as: name, tensor type(DenseTensor or SelectedRows). - VLOG(3) << "fwd_tensors.size(): " << fwd_tensors.size(); - for (auto &fwd_t : fwd_tensors) { - if (phi::DenseTensor::classof(fwd_t.impl().get())) { - grad_tensors->emplace_back(std::make_shared()); - } else if (phi::SelectedRows::classof(fwd_t.impl().get())) { - grad_tensors->emplace_back(std::make_shared()); + for (auto &t : x) { + if (t.is_dense_tensor()) { + x_grad->emplace_back(std::make_shared()); + } else if (t.is_selected_rows()) { + x_grad->emplace_back(std::make_shared()); } - auto &grad_t = grad_tensors->back(); - grad_t.set_name(fwd_t.name() + "@GRAD"); + x_grad->back().set_name(t.name() + "@GRAD"); } } - void ConstructGradTensors( - const std::vector &fwd_tensors) { - VLOG(3) << "fwd_tensors.size(): " << fwd_tensors.size(); - for (auto &fwd_t : fwd_tensors) { - auto grad_tesnor = egr::EagerUtils::unsafe_autograd_meta(fwd_t)->Grad(); - grad_tesnor.set_name(fwd_t.name() + "@GRAD"); + void ConstructParamGradTensors( + const std::vector ¶m, + std::vector *param_grad) { + for (auto &t : param) { + auto t_meta = egr::EagerUtils::unsafe_autograd_meta(t); + auto t_grad = egr::EagerUtils::unsafe_autograd_meta(t)->Grad(); + if (t_meta->StopGradient()) { + param_grad->emplace_back(); + } else if (t_grad.is_dense_tensor()) { + param_grad->emplace_back(std::make_shared()); + } else if (t_grad.is_selected_rows()) { + param_grad->emplace_back(std::make_shared()); + } + param_grad->back().set_name(t.name() + "@GRAD"); } } diff --git a/paddle/fluid/eager/utils.cc b/paddle/fluid/eager/utils.cc index 20faae95281db87ad4896b19e63857cf4b7e5e02..f25c4dfcd5932c835e580d70272f56e351617a57 100644 --- a/paddle/fluid/eager/utils.cc +++ b/paddle/fluid/eager/utils.cc @@ -271,6 +271,7 @@ void EagerUtils::GetOutput(const std::shared_ptr& out, "shared_ptr, this error may indicate some outputs " "are nullptr")); 
out_var->set_impl(out->GetTensorBase()); + out_var->set_name(out->name()); } void EagerUtils::GetOutputs( diff --git a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt index 2b8b4b3ff9573f601f8da3092c18433a49a93869..ead6dd7e6898d98093046f338d80b1c8f60f17b2 100644 --- a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt @@ -13,6 +13,9 @@ IF(WITH_GPU) nv_library(graph_gpu_ps SRCS graph_gpu_ps_table.h DEPS heter_comm table) nv_test(test_graph_comm SRCS test_graph.cu DEPS graph_gpu_ps) nv_test(test_cpu_graph_sample SRCS test_cpu_graph_sample.cu DEPS graph_gpu_ps) + #nv_test(test_sample_rate SRCS test_sample_rate.cu DEPS graph_gpu_ps) + # ADD_EXECUTABLE(test_sample_rate test_sample_rate.cu) + # target_link_libraries(test_sample_rate graph_gpu_ps) ENDIF() IF(WITH_ROCM) hip_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h DEPS cub device_context) diff --git a/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h b/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h index 235f7a226ad17649960d1e72d7907e8013e406fe..f18fa47fffd9adb6853286848e120541bd13d52f 100644 --- a/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h +++ b/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h @@ -93,14 +93,17 @@ node_list[8]-> node_id:17, neighbor_size:1, neighbor_offset:15 struct NeighborSampleResult { int64_t *val; int *actual_sample_size, sample_size, key_size; + int *offset; NeighborSampleResult(int _sample_size, int _key_size) : sample_size(_sample_size), key_size(_key_size) { actual_sample_size = NULL; val = NULL; + offset = NULL; }; ~NeighborSampleResult() { if (val != NULL) cudaFree(val); if (actual_sample_size != NULL) cudaFree(actual_sample_size); + if (offset != NULL) cudaFree(offset); } }; diff --git a/paddle/fluid/framework/fleet/heter_ps/test_cpu_graph_sample.cu b/paddle/fluid/framework/fleet/heter_ps/test_cpu_graph_sample.cu index 8c7ea10b26565a4181230f6150272babd315105f..0f7e38ac95e1b985da3ed34b61a05ac0c396c5e2 100644 --- a/paddle/fluid/framework/fleet/heter_ps/test_cpu_graph_sample.cu +++ b/paddle/fluid/framework/fleet/heter_ps/test_cpu_graph_sample.cu @@ -71,10 +71,10 @@ TEST(TEST_FLEET, graph_sample) { */ ::paddle::distributed::GraphParameter table_proto; table_proto.set_gpups_mode(true); - table_proto.set_gpups_mode_shard_num(127); + table_proto.set_shard_num(127); table_proto.set_gpu_num(3); table_proto.set_gpups_graph_sample_class("BasicBfsGraphSampler"); - table_proto.set_gpups_graph_sample_args("5,5,1,1"); + table_proto.set_gpups_graph_sample_args("100,5,5,1,1"); prepare_file(edge_file_name, edges); g.init_cpu_table(table_proto); g.load(std::string(edge_file_name), std::string("e>")); @@ -93,16 +93,53 @@ TEST(TEST_FLEET, graph_sample) { cudaMalloc((void **)&key, 3 * sizeof(int64_t)); cudaMemcpy(key, cpu_key, 3 * sizeof(int64_t), cudaMemcpyHostToDevice); auto neighbor_sample_res = g.graph_neighbor_sample(0, (int64_t *)key, 3, 3); - int64_t *res = new int64_t[9]; - cudaMemcpy(res, neighbor_sample_res->val, 72, cudaMemcpyDeviceToHost); + int64_t *res = new int64_t[7]; + /* + cudaMemcpy(res, neighbor_sample_res->val, 56, cudaMemcpyDeviceToHost); std::sort(res, res + 3); - std::sort(res + 6, res + 9); - int64_t expected_sample_val[] = {28, 29, 30, 0, -1, -1, 21, 22, 23}; - for (int i = 0; i < 9; i++) { + std::sort(res + 4, res + 7); + //int64_t expected_sample_val[] = {28, 29, 30, 0, -1, -1, 21, 22, 23}; + int64_t 
expected_sample_val[] = {28, 29, 30, 0, 21, 22, 23}; + for (int i = 0; i < 7; i++) { + VLOG(0)<val, 56, cudaMemcpyDeviceToHost); + int *actual_sample_size = new int[3]; + cudaMemcpy(actual_sample_size, neighbor_sample_res->actual_sample_size, 12, + cudaMemcpyDeviceToHost); // 3, 1, 3 + int *cumsum_sample_size = new int[3]; + cudaMemcpy(cumsum_sample_size, neighbor_sample_res->offset, 12, + cudaMemcpyDeviceToHost); // 0, 3, 4 + + std::vector> neighbors_; + std::vector neighbors_7 = {28, 29, 30, 31, 32, 33, 34, 35}; + std::vector neighbors_0 = {0}; + std::vector neighbors_6 = {21, 22, 23, 24, 25, 26, 27}; + neighbors_.push_back(neighbors_7); + neighbors_.push_back(neighbors_0); + neighbors_.push_back(neighbors_6); + for (int i = 0; i < 3; i++) { + for (int j = cumsum_sample_size[i]; + j < cumsum_sample_size[i] + actual_sample_size[i]; j++) { + bool flag = false; + for (int k = 0; k < neighbors_[i].size(); k++) { + if (res[j] == neighbors_[i][k]) { + flag = true; + break; + } + } + ASSERT_EQ(flag, true); + } + } + + delete[] res; + delete[] actual_sample_size; + delete[] cumsum_sample_size; + delete neighbor_sample_res; } diff --git a/paddle/fluid/framework/fleet/heter_ps/test_sample_rate.cu b/paddle/fluid/framework/fleet/heter_ps/test_sample_rate.cu new file mode 100644 index 0000000000000000000000000000000000000000..a4b1a6a7aee1e9c9d4caa05f04908926688bb739 --- /dev/null +++ b/paddle/fluid/framework/fleet/heter_ps/test_sample_rate.cu @@ -0,0 +1,280 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
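(The rewritten test above relies on the new offset field of NeighborSampleResult: val is one flattened buffer of sampled neighbors for all keys, actual_sample_size holds the per-key count, and offset is the exclusive prefix sum of those counts. A minimal plain-Python sketch of that indexing scheme, reusing the same example values the test checks, 3/1/3 samples with offsets 0/3/4:)

val = [28, 29, 30, 0, 21, 22, 23]        # flattened neighbors for three keys
actual_sample_size = [3, 1, 3]           # neighbors actually sampled per key
offset = [0, 3, 4]                       # exclusive prefix sum of actual_sample_size
for i, (start, size) in enumerate(zip(offset, actual_sample_size)):
    # key i owns val[offset[i] : offset[i] + actual_sample_size[i]]
    print("key", i, "neighbors", val[start:start + size])
# key 0 -> [28, 29, 30], key 1 -> [0], key 2 -> [21, 22, 23]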
+ +#include +#include // NOLINT +#include +#include +#include +#include // NOLINT +#include +#include +#include "google/protobuf/text_format.h" + +#include +#include "gtest/gtest.h" +#include "paddle/fluid/distributed/ps.pb.h" +#include "paddle/fluid/distributed/ps/service/env.h" +#include "paddle/fluid/distributed/ps/service/sendrecv.pb.h" +#include "paddle/fluid/distributed/ps/table/common_graph_table.h" +#include "paddle/fluid/distributed/ps/table/graph/graph_node.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/string/printf.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" +#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h" +#include "paddle/fluid/framework/fleet/heter_ps/heter_comm.h" +#include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h" +#include "paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h" +#include "paddle/fluid/platform/cuda_device_guard.h" + +using namespace paddle::framework; +namespace platform = paddle::platform; +namespace operators = paddle::operators; +namespace memory = paddle::memory; +namespace distributed = paddle::distributed; + +std::string input_file; +int fixed_key_size = 100, sample_size = 100, + bfs_sample_nodes_in_each_shard = 10000, init_search_size = 1, + bfs_sample_edges = 20; +std::vector edges = { + std::string("37\t45\t0.34"), std::string("37\t145\t0.31"), + std::string("37\t112\t0.21"), std::string("96\t48\t1.4"), + std::string("96\t247\t0.31"), std::string("96\t111\t1.21"), + std::string("59\t45\t0.34"), std::string("59\t145\t0.31"), + std::string("59\t122\t0.21"), std::string("97\t48\t0.34"), + std::string("97\t247\t0.31"), std::string("97\t111\t0.21")}; +// odd id:96 48 122 112 +char edge_file_name[] = "test_edges.txt"; + +void prepare_file(char file_name[], std::vector data) { + std::ofstream ofile; + ofile.open(file_name); + for (auto x : data) { + ofile << x << std::endl; + } + + ofile.close(); +} + +void testSampleRate() { +#ifdef PADDLE_WITH_HETERPS + std::vector ids; + int start = 0; + pthread_rwlock_t rwlock; + pthread_rwlock_init(&rwlock, NULL); + { + ::paddle::distributed::GraphParameter table_proto; + table_proto.set_gpups_mode(false); + table_proto.set_shard_num(127); + table_proto.set_task_pool_size(24); + std::cerr << "initializing begin"; + distributed::GraphTable graph_table; + graph_table.initialize(table_proto); + std::cerr << "initializing done"; + graph_table.load(input_file, std::string("e>")); + int sample_actual_size = -1; + int step = fixed_key_size, cur = 0; + while (sample_actual_size != 0) { + std::unique_ptr buffer; + graph_table.pull_graph_list(cur, step, buffer, sample_actual_size, false, + 1); + int index = 0; + while (index < sample_actual_size) { + paddle::distributed::FeatureNode node; + node.recover_from_buffer(buffer.get() + index); + index += node.get_size(false); + // res.push_back(node); + ids.push_back(node.get_id()); + int swap_pos = rand() % ids.size(); + std::swap(ids[swap_pos], ids[(int)ids.size() - 1]); + } + cur = ids.size(); + // if (sample_actual_size == 0) break; + // char *buff = buffer.get(); + // for (int i = 0; i < sample_actual_size/sizeof(int64_t); i++) { + // ids.push_back(*((int64_t *)buff + i)); + // int swap_pos = rand() % 
ids.size(); + // std::swap(ids[swap_pos], ids[(int)ids.size() - 1]); + // } + // cur += sample_actual_size/sizeof(int64_t); + } + std::cerr << "load ids done" << std::endl; + std::vector sample_id[10], sample_neighbors[10]; + std::vector actual_size[10]; + auto func = [&rwlock, &graph_table, &ids, &sample_id, &actual_size, + &sample_neighbors, &start](int i) { + while (true) { + int s, sn; + bool exit = false; + pthread_rwlock_wrlock(&rwlock); + if (start < ids.size()) { + s = start; + sn = ids.size() - start; + sn = min(sn, fixed_key_size); + start += sn; + } else { + exit = true; + } + pthread_rwlock_unlock(&rwlock); + if (exit) break; + std::vector> buffers(sn); + std::vector ac(sn); + auto status = graph_table.random_sample_neighbors( + ids.data() + s, sample_size, buffers, ac, false); + for (int j = s; j < s + sn; j++) { + sample_id[i].push_back(ids[j]); + actual_size[i].push_back(ac[j - s] / sizeof(int64_t)); + int ss = ac[j - s] / sizeof(int64_t); + for (int k = 0; k < ss; k++) { + sample_neighbors[i].push_back( + *((int64_t *)(buffers[j - s].get() + k * sizeof(int64_t)))); + } + } + } + VLOG(0) << "func " << i << " returns "; + }; + auto start1 = std::chrono::steady_clock::now(); + std::thread thr[10]; + for (int i = 0; i < 10; i++) { + thr[i] = std::thread(func, i); + } + for (int i = 0; i < 10; i++) thr[i].join(); + auto end1 = std::chrono::steady_clock::now(); + auto tt = + std::chrono::duration_cast(end1 - start1); + std::cerr << "total time cost without cache is " << tt.count() << " us" + << std::endl; + } + const int gpu_num = 8; + ::paddle::distributed::GraphParameter table_proto; + table_proto.set_gpups_mode(true); + table_proto.set_shard_num(127); + table_proto.set_gpu_num(gpu_num); + table_proto.set_gpups_graph_sample_class("BasicBfsGraphSampler"); + table_proto.set_gpups_graph_sample_args(std::to_string(init_search_size) + + ",100000000,10000000,1,1"); + std::vector dev_ids; + for (int i = 0; i < gpu_num; i++) { + dev_ids.push_back(i); + } + std::shared_ptr resource = + std::make_shared(dev_ids); + resource->enable_p2p(); + GpuPsGraphTable g(resource); + g.init_cpu_table(table_proto); + g.load(std::string(input_file), std::string("e>")); + NodeQueryResult *query_node_res; + query_node_res = g.query_node_list(0, 0, ids.size() + 10000); + + VLOG(0) << "gpu got " << query_node_res->actual_sample_size << " nodes "; + VLOG(0) << "cpu got " << ids.size() << " nodes"; + ASSERT_EQ((int)query_node_res->actual_sample_size, (int)ids.size()); + + int64_t *gpu_node_res = new int64_t[ids.size()]; + cudaMemcpy(gpu_node_res, query_node_res->val, ids.size() * sizeof(int64_t), + cudaMemcpyDeviceToHost); + std::unordered_set cpu_node_set, gpu_node_set; + for (auto x : ids) { + cpu_node_set.insert(x); + } + for (int i = 0; i < (int)query_node_res->actual_sample_size; i++) { + auto x = gpu_node_res[i]; + ASSERT_EQ(cpu_node_set.find(x) != cpu_node_set.end(), true); + gpu_node_set.insert(x); + } + VLOG(0) << " cpu_node_size = " << cpu_node_set.size(); + VLOG(0) << " gpu_node_size = " << gpu_node_set.size(); + ASSERT_EQ(cpu_node_set.size(), gpu_node_set.size()); + for (int i = 0; i < 20; i++) { + int st = ids.size() / 20 * i; + auto q = g.query_node_list(0, st, ids.size() / 20); + VLOG(0) << " the " << i << "th iteration size = " << q->actual_sample_size; + } +// NodeQueryResult *query_node_list(int gpu_id, int start, int query_size); + +/* + void *key; + cudaMalloc((void **)&key, ids.size() * sizeof(int64_t)); + cudaMemcpy(key, ids.data(), ids.size() * sizeof(int64_t), + 
cudaMemcpyHostToDevice); + std::vector res[gpu_num]; + start = 0; + auto func = [&rwlock, &g, &res, &start, + &gpu_num, &ids, &key](int i) { + while (true) { + int s, sn; + bool exit = false; + pthread_rwlock_wrlock(&rwlock); + if (start < ids.size()) { + s = start; + sn = ids.size() - start; + sn = min(sn, fixed_key_size); + start += sn; + } else { + exit = true; + } + pthread_rwlock_unlock(&rwlock); + if (exit) break; + auto r = + g.graph_neighbor_sample(i, (int64_t *)(key + s), sample_size, sn); + res[i].push_back(r); + } + }; + auto start1 = std::chrono::steady_clock::now(); + std::thread thr[gpu_num]; + for (int i = 0; i < gpu_num; i++) { + thr[i] = std::thread(func, i); + } + for (int i = 0; i < gpu_num; i++) thr[i].join(); + auto end1 = std::chrono::steady_clock::now(); + auto tt = + std::chrono::duration_cast(end1 - start1); + std::cerr << "total time cost without cache is " << tt.count() << " us" + << std::endl; +*/ +#endif +} + +// TEST(testSampleRate, Run) { testSampleRate(); } + +int main(int argc, char *argv[]) { + for (int i = 0; i < argc; i++) + VLOG(0) << "Argument " << i << " is " << std::string(argv[i]); + if (argc > 1) { + input_file = argv[1]; + } else { + prepare_file(edge_file_name, edges); + input_file = edge_file_name; + } + VLOG(0) << "input_file is " << input_file; + if (argc > 2) { + fixed_key_size = std::stoi(argv[2]); + } + VLOG(0) << "sample_node_size for every batch is " << fixed_key_size; + if (argc > 3) { + sample_size = std::stoi(argv[3]); + } + VLOG(0) << "sample_size neighbor_size is " << sample_size; + if (argc > 4) init_search_size = std::stoi(argv[4]); + VLOG(0) << " init_search_size " << init_search_size; + testSampleRate(); +} diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass_test.cc b/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass_test.cc index ed9f6230720f83100e641068c8664d643b6db260..60f4e4b309c5d817a0d0e8eaf07f505d55837b93 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass_test.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass_test.cc @@ -24,7 +24,7 @@ #include "paddle/fluid/framework/parallel_executor.h" #include "paddle/fluid/framework/program_desc.h" -USE_OP(mul); +USE_OP_ITSELF(mul); USE_OP(cinn_launch); USE_OP_ITSELF(elementwise_add); namespace paddle::framework { diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index 62e801b76955d74f15bfd81f8da641671de7307b..25cb15d2cc8c27e5fa1477e60e4428d5823495dd 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -234,10 +234,26 @@ void InterpreterCore::Convert( gc_check_input_list.erase(last, gc_check_input_list.end()); for (auto var_id : gc_check_input_list) { - vec_meta_info[var_id].var_ref_count_++; - instr.AddGCCheckVar(var_id); - VLOG(4) << "clear " << global_scope_->GetNameById(var_id) << " after " - << instr.OpBase()->Type(); + paddle::framework::Variable* var = global_scope_->Var(var_id); + if (var->IsType() || var->IsType() || + var->IsType()) { + vec_meta_info[var_id].var_ref_count_++; + // TODO(zhiqiu): not all var needs to be checked, var need to be checked + // only + // after the last_live_op. For example, + // b = op1(a) + // c = op2(a, b) + // in this case, a is the input of op1 and op2, we only need to check + // a after op2, because op2 always uses a after op1. 
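(The TODO in the interpreter-core hunk above notes that a variable only has to be GC-checked after the op that uses it last, not after every op that reads it. A rough Python sketch of that idea, using the same a/b/c example from the comment; the op-list format here is illustrative only:)

ops = [("op1", ["a"], ["b"]),            # b = op1(a)
       ("op2", ["a", "b"], ["c"])]       # c = op2(a, b)
last_use = {}
for idx, (_, inputs, _) in enumerate(ops):
    for var in inputs:
        last_use[var] = idx              # later uses overwrite earlier ones
print(last_use)                          # {'a': 1, 'b': 1} -> check a and b only after op2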
+ instr.AddGCCheckVar(var_id); + VLOG(4) << "clear " << global_scope_->GetNameById(var_id) << " after " + << instr.OpBase()->Type(); + } else { + VLOG(4) << "not clear " << global_scope_->GetNameById(var_id) + << " after " << instr.OpBase()->Type() + << " because its type is " + << framework::ToTypeName(var->Type()); + } } } diff --git a/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc b/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc index 47dffd47b7cbbf4a37e6715b40d41024330bc679..c11c7124b62777a8a5da12f67594d01f255f6637 100644 --- a/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc +++ b/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc @@ -674,7 +674,7 @@ TEST(BuildCinnPassTest, NoNeedBufferInput) { } // namespace paddle USE_PASS(build_cinn_pass); -USE_OP(mul); +USE_OP_ITSELF(mul); USE_OP_ITSELF(relu); USE_OP_ITSELF(elementwise_add); USE_OP_ITSELF(relu_grad); diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc index cdccc4c5546900a141a084281f419c2940b23817..44f4424d70d4c85e1a4d447feba14a3f1b92c267 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc @@ -300,6 +300,6 @@ TEST(CinnCompilerTest, Compile) { USE_PASS(build_cinn_pass); USE_PASS(graph_viz_pass); -USE_OP(mul); +USE_OP_ITSELF(mul); USE_OP_ITSELF(relu); USE_OP_ITSELF(elementwise_add); diff --git a/paddle/fluid/imperative/tests/test_eager.cc b/paddle/fluid/imperative/tests/test_eager.cc index 7ec21385bb73750aaea501fb3b99c2ea76f52a68..4a0b99518a63f87e67c33d96f06a39e1e904cee1 100644 --- a/paddle/fluid/imperative/tests/test_eager.cc +++ b/paddle/fluid/imperative/tests/test_eager.cc @@ -98,4 +98,4 @@ TEST(test_var_helper, eager_var_helper) { } // namespace imperative } // namespace paddle -USE_OP(mul); +USE_OP_ITSELF(mul); diff --git a/paddle/fluid/imperative/tests/test_hooks.cc b/paddle/fluid/imperative/tests/test_hooks.cc index 02a1689c23a3fe5e1543a2e52d7661d5997bc062..eb7e327662c3013e0ac6ee8ef12dbe484db2a4ce 100644 --- a/paddle/fluid/imperative/tests/test_hooks.cc +++ b/paddle/fluid/imperative/tests/test_hooks.cc @@ -28,6 +28,8 @@ PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul_with_flatten, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul_with_flatten_grad, CPU, ALL_LAYOUT); namespace platform = paddle::platform; namespace framework = paddle::framework; @@ -267,7 +269,7 @@ TEST(TestHooks, TestGradVarLeafBackwardHookWithSortedGradAccmulated) { } // namespace imperative } // namespace paddle -USE_OP(mul); -USE_OP(mul_grad); +USE_OP_ITSELF(mul); +USE_OP_ITSELF(mul_grad); USE_OP_ITSELF(elementwise_add); USE_OP_ITSELF(elementwise_add_grad); diff --git a/paddle/fluid/imperative/tests/test_layer.cc b/paddle/fluid/imperative/tests/test_layer.cc index 3fa87d415db0d9946ce3df8b95dc3641bbe9f9c3..3e5ab9ab9636801b487ed536ae486bf95ff63de1 100644 --- a/paddle/fluid/imperative/tests/test_layer.cc +++ b/paddle/fluid/imperative/tests/test_layer.cc @@ -416,4 +416,4 @@ TEST(test_layer, test_eager) { } // namespace imperative } // namespace paddle -USE_OP(mul); +USE_OP_ITSELF(mul); diff --git a/paddle/fluid/imperative/tests/test_tracer.cc b/paddle/fluid/imperative/tests/test_tracer.cc index 75876e07fb5c78fb6ec6949489efac9fcf618a69..1c3a04b51abd036325801af484bb1d800152c328 100644 --- a/paddle/fluid/imperative/tests/test_tracer.cc +++ b/paddle/fluid/imperative/tests/test_tracer.cc @@ -34,9 +34,13 @@ 
PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(sum, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(sum_grad, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul_with_flatten, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul_with_flatten_grad, CPU, ALL_LAYOUT); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(sum_grad, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul_with_flatten, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul_with_flatten_grad, GPU, ALL_LAYOUT); #endif namespace imperative = paddle::imperative; @@ -598,8 +602,8 @@ TEST(test_tracer, eager_tracer) { } // namespace imperative } // namespace paddle -USE_OP(mul); -USE_OP(mul_grad); +USE_OP_ITSELF(mul); +USE_OP_ITSELF(mul_grad); USE_OP_ITSELF(reduce_sum); USE_OP_ITSELF(reduce_sum_grad); USE_OP_ITSELF(elementwise_add); diff --git a/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc b/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc index 1ae2668e733aad23241c63b9985e708396d0b1bc..8134d389469cbe7d654fd675a75a8123257339b1 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc @@ -43,4 +43,4 @@ TEST(fc_op, test) { } // namespace tensorrt } // namespace inference } // namespace paddle -USE_OP(mul); +USE_OP_ITSELF(mul); diff --git a/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc b/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc index 282f53559aa75b2c7c252450e392e1996f9b1d81..86cb7543d42da65cc9f82cd13b06610fe532c164 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc @@ -46,4 +46,4 @@ TEST(MulOpConverter, main) { } // namespace inference } // namespace paddle -USE_OP(mul); +USE_OP_ITSELF(mul); diff --git a/paddle/fluid/operators/mean_op.cu b/paddle/fluid/operators/mean_op.cu index e8964765ec6549c106f877341b3d013cfe102e25..813dce6080130c0e4894f085c8c199e147e275bb 100644 --- a/paddle/fluid/operators/mean_op.cu +++ b/paddle/fluid/operators/mean_op.cu @@ -65,9 +65,10 @@ class MeanCUDAKernel : public framework::OpKernel { for (decltype(rank) i = 0; i < rank; ++i) { reduce_dims.push_back(i); } - TensorReduceImpl( - context.cuda_device_context(), *input, output, Div(numel), reduce_dims, - stream); + TensorReduceImpl>( + context.cuda_device_context(), *input, output, + kps::IdentityFunctor(), reduce_dims, stream, true); } }; diff --git a/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc index fe9faab7d6449c6c9ca7b3fbda30662e6174eace..0f70b67bbbd68d00f2d8f341e3d797062aa14b64 100644 --- a/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc @@ -14,7 +14,7 @@ limitations under the License. */ #include -#include "paddle/fluid/operators/mul_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/mkldnn_reuse.h" namespace phi { @@ -46,6 +46,9 @@ using dnnl::memory; using dnnl::prop_kind; using dnnl::stream; +constexpr int kMULMKLDNNINT8 = 1; +constexpr int kMULMKLDNNFP32 = 2; + template class MulPrimitiveFactory { public: diff --git a/paddle/fluid/operators/mul_op.cc b/paddle/fluid/operators/mul_op.cc index bc57b429127f0a909eda133de30cdd84acc42b32..6738f15ef74c67b336fbebf3f0ff9cd37a0d93da 100644 --- a/paddle/fluid/operators/mul_op.cc +++ b/paddle/fluid/operators/mul_op.cc @@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/mul_op.h" #include #include #include #include +#include "paddle/fluid/framework/op_registry.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif @@ -27,6 +27,9 @@ namespace operators { using framework::OpKernelType; using framework::Tensor; +constexpr int kMULMKLDNNINT8 = 1; +constexpr int kMULMKLDNNFP32 = 2; + class MulOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -354,16 +357,3 @@ REGISTER_OPERATOR(mul_grad, ops::MulGradOp, ops::MulDoubleGradMaker); REGISTER_OPERATOR(mul_grad_grad, ops::MulDoubleGradOp); - -REGISTER_OP_CPU_KERNEL( - mul, ops::MulKernel, - ops::MulKernel); - -REGISTER_OP_CPU_KERNEL( - mul_grad, ops::MulGradKernel, - ops::MulGradKernel); - -REGISTER_OP_CPU_KERNEL( - mul_grad_grad, - ops::MulDoubleGradKernel, - ops::MulDoubleGradKernel); diff --git a/paddle/fluid/operators/mul_op.cu.cc b/paddle/fluid/operators/mul_op.cu.cc deleted file mode 100644 index 6e841712b9bffc06ca56afddcb866af8b3f9b0d8..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/mul_op.cu.cc +++ /dev/null @@ -1,30 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/mul_op.h" -#include "paddle/fluid/platform/float16.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL(mul, ops::MulKernel, - ops::MulKernel, - ops::MulKernel); -REGISTER_OP_CUDA_KERNEL( - mul_grad, ops::MulGradKernel, - ops::MulGradKernel, - ops::MulGradKernel); -REGISTER_OP_CUDA_KERNEL( - mul_grad_grad, - ops::MulDoubleGradKernel, - ops::MulDoubleGradKernel); diff --git a/paddle/fluid/operators/mul_op.h b/paddle/fluid/operators/mul_op.h deleted file mode 100644 index ce91c6dd0edf1fde90a15b1929fa4560a65f555e..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/mul_op.h +++ /dev/null @@ -1,207 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -constexpr int kMULMKLDNNINT8 = 1; -constexpr int kMULMKLDNNFP32 = 2; - -template -class MulKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* x = context.Input("X"); - const Tensor* y = context.Input("Y"); - Tensor* z = context.Output("Out"); - const Tensor x_matrix = - x->dims().size() > 2 - ? framework::ReshapeToMatrix( - *x, context.template Attr("x_num_col_dims")) - : *x; - const Tensor y_matrix = - y->dims().size() > 2 - ? framework::ReshapeToMatrix( - *y, context.template Attr("y_num_col_dims")) - : *y; - - z->mutable_data(context.GetPlace()); - auto z_dim = z->dims(); - if (z_dim.size() != 2) { - z->Resize({x_matrix.dims()[0], y_matrix.dims()[1]}); - } - - auto blas = phi::funcs::GetBlas(context); - - blas.MatMul(x_matrix, y_matrix, z); - if (z_dim.size() != 2) { - z->Resize(z_dim); - } - } -}; - -template -class MulGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - int x_num_col_dims = ctx.template Attr("x_num_col_dims"); - int y_num_col_dims = ctx.template Attr("y_num_col_dims"); - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto x_matrix = x->dims().size() > 2 - ? framework::ReshapeToMatrix(*x, x_num_col_dims) - : static_cast(*x); - auto y_matrix = y->dims().size() > 2 - ? framework::ReshapeToMatrix(*y, y_num_col_dims) - : static_cast(*y); - auto* dout = ctx.Input(framework::GradVarName("Out")); - - Tensor dout_mat; - dout_mat.ShareDataWith(*dout); - dout_mat.Resize({phi::flatten_to_2d(x->dims(), x_num_col_dims)[0], - phi::flatten_to_2d(y->dims(), y_num_col_dims)[1]}); - - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - - if (dx != nullptr) { - dx->set_lod(x->lod()); - } - if (dy != nullptr) { - dy->set_lod(y->lod()); - } - - auto& dev_ctx = ctx.template device_context(); - auto blas = phi::funcs::GetBlas(dev_ctx); - if (dx) { - dx->mutable_data(ctx.GetPlace()); - Tensor dx_matrix = dx->dims().size() > 2 - ? framework::ReshapeToMatrix(*dx, x_num_col_dims) - : *dx; - - // dx = dout * y'. dx: M x K, dout : M x N, y : K x N - blas.MatMul(dout_mat, false, y_matrix, true, &dx_matrix); - } - if (dy) { - dy->mutable_data(ctx.GetPlace()); - Tensor dy_matrix = dy->dims().size() > 2 - ? framework::ReshapeToMatrix(*dy, y_num_col_dims) - : *dy; - // dy = x' * dout. dy K x N, dout : M x N, x : M x K - blas.MatMul(x_matrix, true, dout_mat, false, &dy_matrix); - } - } -}; - -template -class MulDoubleGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - int x_num_col_dims = ctx.template Attr("x_num_col_dims"); - int y_num_col_dims = ctx.template Attr("y_num_col_dims"); - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto x_mat = x->dims().size() > 2 - ? framework::ReshapeToMatrix(*x, x_num_col_dims) - : static_cast(*x); - auto y_mat = y->dims().size() > 2 - ? 
framework::ReshapeToMatrix(*y, y_num_col_dims) - : static_cast(*y); - - const int m = phi::flatten_to_2d(x->dims(), x_num_col_dims)[0]; - const int n = phi::flatten_to_2d(y->dims(), y_num_col_dims)[1]; - - auto* dout = ctx.Input("DOut"); - Tensor dout_mat; - dout_mat.ShareDataWith(*dout); - dout_mat.Resize({m, n}); - - auto* ddx = ctx.Input("DDX"); - auto* ddy = ctx.Input("DDY"); - - auto* dx = ctx.Output("DX"); - auto* dy = ctx.Output("DY"); - auto* ddout = ctx.Output("DDOut"); - - Tensor ddout_mat; - if (ddout) { - ddout->set_lod(dout->lod()); - // allocate and reshape ddout - ddout->mutable_data(ctx.GetPlace()); - ddout_mat.ShareDataWith(*ddout); - ddout_mat.Resize({m, n}); - } - - auto& dev_ctx = ctx.template device_context(); - auto blas = phi::funcs::GetBlas(dev_ctx); - // a flag to specify whether ddout value has been set, if flag - // is false, MatMul beta should be 0 to set ddout, if flag is - // true, MatMul beta should be 1 to add result to ddout. - bool ddout_flag = false; - if (ddx) { - auto ddx_mat = ddx->dims().size() > 2 - ? framework::ReshapeToMatrix(*ddx, x_num_col_dims) - : static_cast(*ddx); - - // dy = ddx' * dout. dy : K x M, ddx' : K x M, dout : M x N - if (dy) { - dy->set_lod(y->lod()); - // allocate and reshape dy - dy->mutable_data(ctx.GetPlace()); - Tensor dy_mat = dy->dims().size() > 2 - ? framework::ReshapeToMatrix(*dy, y_num_col_dims) - : *dy; - blas.MatMul(ddx_mat, true, dout_mat, false, &dy_mat); - } - // ddout1 = ddx * y. ddx : M x K, y : K x N, ddout1 : M x N - if (ddout) { - blas.MatMul(ddx_mat, false, y_mat, false, static_cast(1.0), - &ddout_mat, static_cast(ddout_flag)); - ddout_flag = true; - } - } - if (ddy) { - auto ddy_mat = ddy->dims().size() > 2 - ? framework::ReshapeToMatrix(*ddy, y_num_col_dims) - : static_cast(*ddy); - // dx = dout * ddy'. dout : M x N, ddy' : N x K, dx : M x K - if (dx) { - dx->set_lod(x->lod()); - // allocate and reshape dx - dx->mutable_data(ctx.GetPlace()); - Tensor dx_mat = dx->dims().size() > 2 - ? framework::ReshapeToMatrix(*dx, x_num_col_dims) - : *dx; - blas.MatMul(dout_mat, false, ddy_mat, true, &dx_mat); - } - // ddout2 = x * ddy. x : M x K, ddy : K x N, ddout2 : M x N - if (ddout) { - blas.MatMul(x_mat, false, ddy_mat, false, static_cast(1.0), - &ddout_mat, static_cast(ddout_flag)); - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/mul_op_npu.cc b/paddle/fluid/operators/mul_op_npu.cc index e1fb5f4f6b0f827e753931d2d4970d2f7859f423..2aedfed9f8e497b7e931c9aead4ff3c8da92c19b 100644 --- a/paddle/fluid/operators/mul_op_npu.cc +++ b/paddle/fluid/operators/mul_op_npu.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include #include -#include "paddle/fluid/operators/mul_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/mul_op_xpu.cc b/paddle/fluid/operators/mul_op_xpu.cc index 1fdaa2729909af54653ca840576993d521d1de77..6ef41e059c7d99c3327994a9fac6fdaf5290bfa5 100644 --- a/paddle/fluid/operators/mul_op_xpu.cc +++ b/paddle/fluid/operators/mul_op_xpu.cc @@ -14,11 +14,11 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/mul_op.h" #include #include #include #include +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/multiplex_op.cc b/paddle/fluid/operators/multiplex_op.cc index 8771a6573cba044d182aced752d3a65c446ad32e..4e6ad35e612b7ea0392840be863f8820957765ac 100644 --- a/paddle/fluid/operators/multiplex_op.cc +++ b/paddle/fluid/operators/multiplex_op.cc @@ -14,8 +14,13 @@ limitations under the License. */ #include #include + +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/multiary.h" + namespace paddle { namespace operators { @@ -25,44 +30,6 @@ class MultiplexOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Ids"), "Input", "Ids", "Multiplex"); - PADDLE_ENFORCE_NE( - ctx->Inputs("X").empty(), true, - platform::errors::InvalidArgument("MultiInput(X) shouldn't be empty.")); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Multiplex"); - auto ids_dim = ctx->GetInputDim("Ids"); - PADDLE_ENFORCE_EQ( - ids_dim.size(), 2, - platform::errors::PreconditionNotMet( - "The index tensor must be a vector with 2 dimensions")); - PADDLE_ENFORCE_EQ( - ids_dim[1], 1, - platform::errors::PreconditionNotMet( - "The index tensor must be a vector with batchSize x 1.")); - - auto ins_dims = ctx->GetInputsDim("X"); - auto num_ins = ins_dims.size(); - PADDLE_ENFORCE_GT(num_ins, 1, - platform::errors::InvalidArgument( - "multiplex operator should have more than " - "one candidate input tensors.")); - - auto in_dim = ins_dims[0]; - PADDLE_ENFORCE_GE( - in_dim.size(), 2, - platform::errors::InvalidArgument( - "The rank of candidate tensors must be not less than 2.")); - for (size_t i = 1; i < num_ins; i++) { - auto dim = ins_dims[i]; - PADDLE_ENFORCE_EQ( - in_dim, dim, - platform::errors::PreconditionNotMet( - "All the candidate tensors must have the same size.")); - } - ctx->SetOutputDim("Out", in_dim); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -164,8 +131,11 @@ class MultiplexGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(multiplex, MultiplexInferShapeFunctor, + PD_INFER_META(phi::MultiplexInferMeta)); REGISTER_OPERATOR(multiplex, ops::MultiplexOp, ops::MultiplexOpMaker, ops::MultiplexGradMaker, - ops::MultiplexGradMaker); + ops::MultiplexGradMaker, + MultiplexInferShapeFunctor); REGISTER_OPERATOR(multiplex_grad, ops::MultiplexGradOp); diff --git a/paddle/fluid/operators/qr_op.cc b/paddle/fluid/operators/qr_op.cc index 82fc9ef1b7858992c49f537ce8608856ef6b6fde..02d5e5f03f02e2ca796555aeeb93167cb916a7c6 100644 --- a/paddle/fluid/operators/qr_op.cc +++ b/paddle/fluid/operators/qr_op.cc @@ -21,6 +21,10 @@ #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -29,43 +33,6 @@ using DDim = framework::DDim; class QrOp : public framework::OperatorWithKernel { public: using 
framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "qr"); - OP_INOUT_CHECK(ctx->HasOutput("Q"), "Output", "Q", "qr"); - OP_INOUT_CHECK(ctx->HasOutput("R"), "Output", "R", "qr"); - - auto x_dims = ctx->GetInputDim("X"); - int x_rank = x_dims.size(); - PADDLE_ENFORCE_GE(x_dims.size(), 2, - platform::errors::InvalidArgument( - "the rank of input must greater than 2")); - bool compute_q; - bool reduced_mode; - int m = x_dims[x_rank - 2]; - int n = x_dims[x_rank - 1]; - int min_mn = std::min(m, n); - std::string mode = ctx->Attrs().Get("mode"); - std::tie(compute_q, reduced_mode) = _parse_qr_mode(mode); - - if (compute_q) { - int k = reduced_mode ? min_mn : m; - auto q_dims_vec = phi::vectorize(x_dims); - q_dims_vec[q_dims_vec.size() - 1] = k; - ctx->SetOutputDim("Q", phi::make_ddim(q_dims_vec)); - } else { - ctx->SetOutputDim("Q", phi::make_ddim({0})); - } - - int k = reduced_mode ? min_mn : m; - auto r_dims_vec = phi::vectorize(x_dims); - r_dims_vec[r_dims_vec.size() - 2] = k; - r_dims_vec[r_dims_vec.size() - 1] = n; - ctx->SetOutputDim("R", phi::make_ddim(r_dims_vec)); - - ctx->ShareLoD("X", /*->*/ "Q"); - ctx->ShareLoD("X", /*->*/ "R"); - } }; class QrOpMaker : public framework::OpProtoAndCheckerMaker { @@ -83,10 +50,8 @@ class QrOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault("reduced"); AddComment(R"DOC( Qr Operator. - This operator is used to perform QR operation for batched matrics $X$. $$Q, R = qr(X)$$ - )DOC"); } }; @@ -138,10 +103,13 @@ class QrGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(qr, QrInferShapeFunctor, + PD_INFER_META(phi::QrInferMeta)); REGISTER_OPERATOR(qr, ops::QrOp, ops::QrOpMaker, ops::QrGradMaker, - ops::QrGradMaker); + ops::QrGradMaker, + QrInferShapeFunctor); REGISTER_OPERATOR(qr_grad, ops::QrGradOp); diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h index 160617695338a9f2e140b7b418c93ef0d7c57e17..b21e41c5b8548273bcd81a882ed696819b1de62f 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h @@ -33,12 +33,12 @@ void TensorReduceImpl(const platform::CUDADeviceContext& dev_ctx, const framework::Tensor& x, framework::Tensor* y, const TransformOp& transform, const std::vector& origin_reduce_dims, - gpuStream_t stream) { + gpuStream_t stream, bool is_mean = false) { y->mutable_data(x.place()); phi::funcs::ReduceKernel( static_cast(dev_ctx), x, y, transform, - origin_reduce_dims); + origin_reduce_dims, is_mean); } } // namespace operators diff --git a/paddle/fluid/operators/tril_triu_op.cc b/paddle/fluid/operators/tril_triu_op.cc index c8010e8a128e0b2483c93ed38047b17060bfb0e9..b941fa3d03ae12928ab85486f1dabb54f3b514f4 100644 --- a/paddle/fluid/operators/tril_triu_op.cc +++ b/paddle/fluid/operators/tril_triu_op.cc @@ -13,29 +13,18 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + namespace paddle { namespace operators { class TrilTriuOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->HasInput("X"), true, - platform::errors::NotFound("Input(X) of TrilTriuOp is not found.")); - PADDLE_ENFORCE_EQ( - ctx->HasOutput("Out"), true, - platform::errors::NotFound("Output(Out) of TrilTriuOp is not found.")); - const auto& x_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_GE(x_dims.size(), 2, - platform::errors::InvalidArgument( - "Input(X)'s rank must be at least 2 in TrilTriuOp.")); - ctx->SetOutputDim("Out", x_dims); - ctx->ShareLoD("X", /*->*/ "Out"); - } }; class TrilTriuOpMaker : public framework::OpProtoAndCheckerMaker { @@ -100,7 +89,10 @@ class TrilTriuGradOpMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; namespace plat = paddle::platform; +DECLARE_INFER_SHAPE_FUNCTOR(tril_triu, TrilTriuInferShapeFunctor, + PD_INFER_META(phi::TrilTriuInferMeta)); REGISTER_OPERATOR(tril_triu, ops::TrilTriuOp, ops::TrilTriuOpMaker, ops::TrilTriuGradOpMaker, - ops::TrilTriuGradOpMaker); + ops::TrilTriuGradOpMaker, + TrilTriuInferShapeFunctor); REGISTER_OPERATOR(tril_triu_grad, ops::TrilTriuGradOp); diff --git a/paddle/fluid/pybind/custom_handwrite_op_funcs.h b/paddle/fluid/pybind/custom_handwrite_op_funcs.h index 3b898ce77ce6fb43ca9aaba38e5db9e01a1d19d3..044c3d5d176e1a021952469db0623197b6302936 100644 --- a/paddle/fluid/pybind/custom_handwrite_op_funcs.h +++ b/paddle/fluid/pybind/custom_handwrite_op_funcs.h @@ -14,6 +14,7 @@ #pragma once #include +#include "paddle/phi/core/enforce.h" static PyObject *eager_api_run_program(PyObject *self, PyObject *args, PyObject *kwargs) { @@ -33,13 +34,24 @@ static PyObject *eager_api_run_program(PyObject *self, PyObject *args, run_program_dygraph_function(X, Params, Out, OutScope, DOut, attrs); PyEval_RestoreThread(tstate); tstate = nullptr; + Py_RETURN_NONE; + } catch (paddle::platform::EnforceNotMet &exception) { + if (tstate) { + PyEval_RestoreThread(tstate); + } + std::ostringstream sout; + sout << exception.what(); + sout << " [operator < run_program > error]"; + exception.set_error_str(sout.str()); + ThrowExceptionToPython(std::current_exception()); + return nullptr; } catch (...) { if (tstate) { PyEval_RestoreThread(tstate); } ThrowExceptionToPython(std::current_exception()); + return nullptr; } - Py_RETURN_NONE; } static PyMethodDef CustomEagerFinalStateMethods[] = { diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc index f3c48309e69fe8b40099b076f673e4ba8c8bcabd..30c34bb55f460417866e54520860df19921a335a 100644 --- a/paddle/fluid/pybind/eager_functions.cc +++ b/paddle/fluid/pybind/eager_functions.cc @@ -40,6 +40,9 @@ limitations under the License. 
*/ #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/sparse_coo_tensor.h" +#include "paddle/phi/core/sparse_csr_tensor.h" + namespace paddle { namespace pybind { @@ -468,6 +471,90 @@ static PyObject* eager_api_run_costum_op(PyObject* self, PyObject* args, EAGER_CATCH_AND_THROW_RETURN_NULL } +static PyObject* eager_api_sparse_coo_tensor(PyObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + auto non_zero_indices = CastPyArg2Tensor(PyTuple_GET_ITEM(args, 0), 0); + auto non_zero_elements = CastPyArg2Tensor(PyTuple_GET_ITEM(args, 1), 1); + auto dense_shape = CastPyArg2VectorOfInt(PyTuple_GET_ITEM(args, 2), 2); + auto stop_gradient = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 3), 3); + PADDLE_ENFORCE(non_zero_indices.is_dense_tensor(), + paddle::platform::errors::Fatal( + "the non-zero indices must be a DenseTensor.")); + PADDLE_ENFORCE(non_zero_elements.is_dense_tensor(), + paddle::platform::errors::Fatal( + "the non-zero elements must be a DenseTensor.")); + auto dense_indices = + std::dynamic_pointer_cast(non_zero_indices.impl()); + auto dense_elements = + std::dynamic_pointer_cast(non_zero_elements.impl()); + // TODO(zhangkaihuo): After create SparseTensor, call coalesced() to sort and + // merge duplicate indices + std::shared_ptr coo_tensor = + std::make_shared(*dense_indices, *dense_elements, + phi::make_ddim(dense_shape)); + paddle::experimental::Tensor tensor; + tensor.set_impl(coo_tensor); + auto name = + egr::Controller::Instance().GenerateUniqueName("generated_tensor"); + tensor.set_name(name); + auto autograd_meta = egr::EagerUtils::autograd_meta(&tensor); + autograd_meta->SetStopGradient(static_cast(stop_gradient)); + if (!autograd_meta->GetMutableGradNode()) { + VLOG(3) << "Tensor(" << name + << ") have not GradNode, add GradNodeAccumulation for it."; + autograd_meta->SetGradNode( + std::make_shared(autograd_meta)); + } + return ToPyObject(tensor); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* eager_api_sparse_csr_tensor(PyObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + auto non_zero_crows = CastPyArg2Tensor(PyTuple_GET_ITEM(args, 0), 0); + auto non_zero_cols = CastPyArg2Tensor(PyTuple_GET_ITEM(args, 1), 1); + auto non_zero_elements = CastPyArg2Tensor(PyTuple_GET_ITEM(args, 2), 2); + auto dense_shape = CastPyArg2VectorOfInt(PyTuple_GET_ITEM(args, 3), 3); + auto stop_gradient = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 4), 4); + PADDLE_ENFORCE(non_zero_crows.is_dense_tensor(), + paddle::platform::errors::Fatal( + "the compressed non-zero rows must be a DenseTensor.")); + PADDLE_ENFORCE(non_zero_cols.is_dense_tensor(), + paddle::platform::errors::Fatal( + "the non-zero cols must be a DenseTensor.")); + PADDLE_ENFORCE(non_zero_elements.is_dense_tensor(), + paddle::platform::errors::Fatal( + "the non-zero elements must be a DenseTensor.")); + + auto dense_crows = + std::dynamic_pointer_cast(non_zero_crows.impl()); + auto dense_cols = + std::dynamic_pointer_cast(non_zero_cols.impl()); + auto dense_elements = + std::dynamic_pointer_cast(non_zero_elements.impl()); + std::shared_ptr csr_tensor = + std::make_shared(*dense_crows, *dense_cols, + *dense_elements, + phi::make_ddim(dense_shape)); + paddle::experimental::Tensor tensor; + tensor.set_impl(csr_tensor); + auto name = + egr::Controller::Instance().GenerateUniqueName("generated_tensor"); + tensor.set_name(name); + auto autograd_meta = egr::EagerUtils::autograd_meta(&tensor); + 
autograd_meta->SetStopGradient(static_cast(stop_gradient)); + if (!autograd_meta->GetMutableGradNode()) { + VLOG(3) << "Tensor(" << name + << ") have not GradNode, add GradNodeAccumulation for it."; + autograd_meta->SetGradNode( + std::make_shared(autograd_meta)); + } + return ToPyObject(tensor); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + PyMethodDef variable_functions[] = { // TODO(jiabin): Remove scale when we have final state tests {"scale", (PyCFunction)(void (*)(void))eager_api_scale, @@ -490,6 +577,14 @@ PyMethodDef variable_functions[] = { {"read_next_tensor_list", (PyCFunction)(void (*)(void))eager_api_read_next_tensor_list, METH_VARARGS | METH_KEYWORDS, NULL}, + /**sparse functions**/ + {"sparse_coo_tensor", + (PyCFunction)(void (*)(void))eager_api_sparse_coo_tensor, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"sparse_csr_tensor", + (PyCFunction)(void (*)(void))eager_api_sparse_csr_tensor, + METH_VARARGS | METH_KEYWORDS, NULL}, + /**sparse functions**/ {NULL, NULL, 0, NULL}}; void BindFunctions(PyObject* module) { diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index bb638ffd3a1e4177934d225e4025484c7a3efd67..cd47e04a3e9c24e78e5e36107bfa1d085ff16b19 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -959,11 +959,11 @@ static PyObject* tensor__set_grad_type(TensorObject* self, PyObject* args, EAGER_TRY auto var_type = pybind::CastPyArg2ProtoType(PyTuple_GET_ITEM(args, 0), 0); auto grad_tensor = - egr::EagerUtils::unsafe_autograd_meta(self->tensor)->Grad(); + egr::EagerUtils::unsafe_autograd_meta(self->tensor)->MutableGrad(); if (var_type == framework::proto::VarType::LOD_TENSOR) { - grad_tensor.set_impl(std::make_shared()); + grad_tensor->set_impl(std::make_shared()); } else if (var_type == framework::proto::VarType::SELECTED_ROWS) { - grad_tensor.set_impl(std::make_shared()); + grad_tensor->set_impl(std::make_shared()); } return Py_None; EAGER_CATCH_AND_THROW_RETURN_NULL @@ -1097,6 +1097,49 @@ static PyObject* tensor_method_is_sparse_csr(TensorObject* self, PyObject* args, EAGER_CATCH_AND_THROW_RETURN_NULL } +static PyObject* tensor_method_to_sparse_coo(TensorObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + int64_t sparse_dim = CastPyArg2AttrLong(PyTuple_GET_ITEM(args, 0), 0); + auto coo_tensor = self->tensor.to_sparse_coo(sparse_dim); + egr::EagerUtils::autograd_meta(&coo_tensor) + ->SetStopGradient( + egr::EagerUtils::autograd_meta(&self->tensor)->StopGradient()); + egr::EagerUtils::autograd_meta(&coo_tensor) + ->SetPersistable( + egr::EagerUtils::autograd_meta(&(self->tensor))->Persistable()); + return ToPyObject(coo_tensor); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* tensor_method_to_sparse_csr(TensorObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + auto csr_tensor = self->tensor.to_sparse_csr(); + egr::EagerUtils::autograd_meta(&csr_tensor) + ->SetStopGradient( + egr::EagerUtils::autograd_meta(&self->tensor)->StopGradient()); + egr::EagerUtils::autograd_meta(&csr_tensor) + ->SetPersistable( + egr::EagerUtils::autograd_meta(&(self->tensor))->Persistable()); + return ToPyObject(csr_tensor); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* tensor_method_to_dense(TensorObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + auto dense_tensor = self->tensor.to_dense(); + egr::EagerUtils::autograd_meta(&dense_tensor) + ->SetStopGradient( + egr::EagerUtils::autograd_meta(&self->tensor)->StopGradient()); + 
egr::EagerUtils::autograd_meta(&dense_tensor) + ->SetPersistable( + egr::EagerUtils::autograd_meta(&(self->tensor))->Persistable()); + return ToPyObject(dense_tensor); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + static PyObject* tensor__inplace_version(TensorObject* self, PyObject* args, PyObject* kwargs) { EAGER_TRY @@ -1185,6 +1228,12 @@ PyMethodDef variable_methods[] = { METH_VARARGS | METH_KEYWORDS, NULL}, {"is_sparse_csr", (PyCFunction)(void (*)(void))tensor_method_is_sparse_csr, METH_VARARGS | METH_KEYWORDS, NULL}, + {"to_sparse_coo", (PyCFunction)(void (*)(void))tensor_method_to_sparse_coo, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"to_sparse_csr", (PyCFunction)(void (*)(void))tensor_method_to_sparse_csr, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"to_dense", (PyCFunction)(void (*)(void))tensor_method_to_dense, + METH_VARARGS | METH_KEYWORDS, NULL}, /***the method of sparse tensor****/ {"_inplace_version", (PyCFunction)(void (*)(void))tensor__inplace_version, METH_VARARGS | METH_KEYWORDS, NULL}, diff --git a/paddle/infrt/backends/tensorrt/trt_engine.cc b/paddle/infrt/backends/tensorrt/trt_engine.cc index 43d356b6d6983afdca220029d34d9d5cd27da009..72d98d865a69eaed654b0c94ddc8578a58f8b298 100644 --- a/paddle/infrt/backends/tensorrt/trt_engine.cc +++ b/paddle/infrt/backends/tensorrt/trt_engine.cc @@ -33,19 +33,21 @@ namespace tensorrt { static nvinfer1::IBuilder* createInferBuilder( nvinfer1::ILogger& logger) { // NOLINT return static_cast( - phi::dynload::createInferBuilder_INTERNAL(&logger, NV_TENSORRT_VERSION)); + ::phi::dynload::createInferBuilder_INTERNAL(&logger, + NV_TENSORRT_VERSION)); } static nvinfer1::IRuntime* createInferRuntime( nvinfer1::ILogger& logger) { // NOLINT return static_cast( - phi::dynload::createInferRuntime_INTERNAL(&logger, NV_TENSORRT_VERSION)); + ::phi::dynload::createInferRuntime_INTERNAL(&logger, + NV_TENSORRT_VERSION)); } TrtEngine::TrtEngine(int device_id) : device_id_(device_id) { FreshDeviceId(); logger_.reset(new TrtLogger()); builder_.reset(createInferBuilder(logger_->GetTrtLogger())); - phi::dynload::initLibNvInferPlugins(&logger_->GetTrtLogger(), ""); + ::phi::dynload::initLibNvInferPlugins(&logger_->GetTrtLogger(), ""); } nvinfer1::IBuilder* TrtEngine::GetTrtBuilder() { @@ -237,11 +239,11 @@ bool TrtEngine::SetupNetworkAndConfig(const BuildOptions& build, } void TrtEngine::PrepareOutputHandle(const std::string& out_name) { - phi::DenseTensor t; + ::phi::DenseTensor t; outputs_.emplace(out_name, t); } -phi::DenseTensor* TrtEngine::GetOutput(const std::string& name) { +::phi::DenseTensor* TrtEngine::GetOutput(const std::string& name) { return &outputs_[name]; } @@ -249,7 +251,7 @@ size_t TrtEngine::GetOutputNum() const { return outputs_.size(); } bool TrtEngine::SetUpInference( const InferenceOptions& inference, - const std::unordered_map& inputs) { + const std::unordered_map& inputs) { // TODO(wilber): now only create one exec_context FreshDeviceId(); CHECK(engine_ != nullptr); @@ -272,7 +274,7 @@ bool TrtEngine::SetUpInference( return true; } -void TrtEngine::Run(const phi::GPUContext& ctx) { +void TrtEngine::Run(const ::phi::GPUContext& ctx) { if (is_dynamic_shape_) { DynamicRun(ctx); } else { @@ -280,7 +282,7 @@ void TrtEngine::Run(const phi::GPUContext& ctx) { } } -void TrtEngine::StaticRun(const phi::GPUContext& ctx) { +void TrtEngine::StaticRun(const ::phi::GPUContext& ctx) { const int num_bindings = engine_->getNbBindings(); std::vector buffers(num_bindings, nullptr); @@ -291,7 +293,8 @@ void TrtEngine::StaticRun(const phi::GPUContext& ctx) { 
buffers[bind_index] = const_cast(static_cast(bind.buffer->data())); if (runtime_batch != -1) { - CHECK_EQ(runtime_batch, phi::vectorize(bind.buffer->dims())[0]); + CHECK_EQ(runtime_batch, + ::phi::vectorize(bind.buffer->dims())[0]); } runtime_batch = bind.buffer->dims()[0]; } @@ -306,7 +309,7 @@ void TrtEngine::StaticRun(const phi::GPUContext& ctx) { for (int i = 0; i < dims.nbDims; ++i) { ddim.push_back(dims.d[i]); } - bind.buffer->Resize(phi::make_ddim(ddim)); + bind.buffer->Resize(::phi::make_ddim(ddim)); // TODO(wilber): now only support float output. ctx.Alloc(bind.buffer, sizeof(float) * bind.buffer->numel()); buffers[bind_index] = static_cast(bind.buffer->data()); @@ -316,7 +319,7 @@ void TrtEngine::StaticRun(const phi::GPUContext& ctx) { runtime_batch, buffers.data(), ctx.stream(), nullptr); } -void TrtEngine::DynamicRun(const phi::GPUContext& ctx) { +void TrtEngine::DynamicRun(const ::phi::GPUContext& ctx) { const int num_bindings = engine_->getNbBindings(); std::vector buffers(num_bindings, nullptr); @@ -344,7 +347,7 @@ void TrtEngine::DynamicRun(const phi::GPUContext& ctx) { for (int i = 0; i < dims.nbDims; ++i) { ddim[i] = dims.d[i]; } - bind.buffer->Resize(phi::make_ddim(ddim)); + bind.buffer->Resize(::phi::make_ddim(ddim)); ctx.Alloc(bind.buffer, sizeof(float) * bind.buffer->numel()); buffers[bind_index] = static_cast(bind.buffer->data()); } @@ -356,7 +359,7 @@ void TrtEngine::FreshDeviceId() { int count; cudaGetDeviceCount(&count); CHECK_LT(device_id_, count); - phi::backends::gpu::SetDeviceId(device_id_); + ::phi::backends::gpu::SetDeviceId(device_id_); } void TrtEngine::GetEngineInfo() { diff --git a/paddle/infrt/backends/tensorrt/trt_engine.h b/paddle/infrt/backends/tensorrt/trt_engine.h index a26474f8cbb357d42cd6d951829bbdc24a256640..41d11a711170921da4fc9beae37e57e811dee769 100644 --- a/paddle/infrt/backends/tensorrt/trt_engine.h +++ b/paddle/infrt/backends/tensorrt/trt_engine.h @@ -76,19 +76,19 @@ class TrtEngine { const BuildOptions& build_options); // TODO(wilber): Modify signature after infrt-trt ready. - void Run(const phi::GPUContext& ctx); + void Run(const ::phi::GPUContext& ctx); // TODO(wilber): How to support multiple execution contexts? bool SetUpInference( const InferenceOptions& inference, - const std::unordered_map& inputs); + const std::unordered_map& inputs); void GetEngineInfo(); void PrepareOutputHandle(const std::string& out_name); // TODO(wilber): The output tensor names are: output_0, output_1, ... 
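A note on the mechanical-looking phi:: -> ::phi:: edits in the infrt TensorRT backend here: infrt declares its own nested phi namespaces (for example ::infrt::phi for the dialect and ::infrt::kernel::phi for kernels), so inside namespace infrt an unqualified phi::DenseTensor can bind to the wrong namespace once those headers are visible. A minimal illustration, not code from the patch:

#include "paddle/phi/core/dense_tensor.h"

namespace infrt {
namespace phi {
struct Placeholder;  // stands in for the contents of infrt's real nested phi namespaces
}  // namespace phi
namespace backends {
namespace tensorrt {
void Example() {
  // phi::DenseTensor t;  // would not compile: "phi" binds to ::infrt::phi here
  ::phi::DenseTensor t;   // the leading "::" always selects the top-level phi namespace
  (void)t;
}
}  // namespace tensorrt
}  // namespace backends
}  // namespace infrt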
- phi::DenseTensor* GetOutput(const std::string&); + ::phi::DenseTensor* GetOutput(const std::string&); size_t GetOutputNum() const; @@ -104,9 +104,9 @@ class TrtEngine { bool ModelToBuildEnv(TrtUniquePtr network, const BuildOptions& build); - void StaticRun(const phi::GPUContext& ctx); + void StaticRun(const ::phi::GPUContext& ctx); - void DynamicRun(const phi::GPUContext& ctx); + void DynamicRun(const ::phi::GPUContext& ctx); private: std::unique_ptr logger_{nullptr}; @@ -118,7 +118,7 @@ class TrtEngine { std::vector> bindings_; int device_id_{0}; bool is_dynamic_shape_{false}; - std::unordered_map outputs_; + std::unordered_map outputs_; }; } // namespace tensorrt diff --git a/paddle/infrt/backends/tensorrt/trt_utils.h b/paddle/infrt/backends/tensorrt/trt_utils.h index c66a850ffb1cc23a24074cbedaed62f7ec87beec..c23d4608bb33fc5de95d7daf913b89d22305fd81 100644 --- a/paddle/infrt/backends/tensorrt/trt_utils.h +++ b/paddle/infrt/backends/tensorrt/trt_utils.h @@ -92,7 +92,7 @@ class TrtLogger : public nvinfer1::ILogger { struct Binding { bool is_input{false}; nvinfer1::DataType data_type{nvinfer1::DataType::kFLOAT}; - phi::DenseTensor* buffer{nullptr}; + ::phi::DenseTensor* buffer{nullptr}; std::string name; }; @@ -103,7 +103,7 @@ class Bindings { void AddBinding(int32_t b, const std::string& name, bool is_input, - phi::DenseTensor* buffer, + ::phi::DenseTensor* buffer, nvinfer1::DataType data_type) { while (bindings_.size() <= static_cast(b)) { bindings_.emplace_back(); diff --git a/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td index 3af7033d2f4c7f434e00d25619df8c5ecf85c759..9df9abe18cbf09b2e521383272e7e758491a2ed3 100644 --- a/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td +++ b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td @@ -97,4 +97,17 @@ def FakeKernelOp : PDT_Op<"fake_phi_kernel"> { let results = (outs DenseTensor:$output); } +// TODO(wilber): Add a infrt_gpu dialect. +def PDT_GpuMemCopyOp : PDT_Op<"memcpy.gpu", [NoSideEffect]> { + let summary = "phi_dt.gpu.memcpy"; + let description = [{gpu memcpy d2h or h2d}]; + // TODO(wilber): add context argument to support stream. 
+ let arguments = (ins + DenseTensor:$input, + Context:$context, + BoolAttr:$d2h + ); + let results = (outs DenseTensor:$output); +} + #endif diff --git a/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.cc b/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.cc index 13cba6eeabb669cf93deb9a37d87d2ddff66e5c0..4abdb388dc23c4be1280e1b33097fe55d8655710 100644 --- a/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.cc +++ b/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.cc @@ -97,12 +97,13 @@ void PhiOpConvertPass::convertStage() { } auto loc = getFunction().getLoc(); builder.setInsertionPoint(op); - if (phi::KernelFactory::Instance().HasCompatiblePhiKernel(op_name)) { - std::string kernel_name = phi::TransToPhiKernelName(op_name); + + if (!::phi::OpUtilsMap::Instance().HasArgumentMappingFn(op_name)) { + op_name = phi::TransToPhiKernelName(op_name); auto kernel_op = builder.create(loc, op->getResultTypes(), op->getOperands(), - kernel_name, + op_name, op->getAttrDictionary()); op->replaceAllUsesWith(kernel_op.getResults()); } else { diff --git a/paddle/infrt/dialect/phi/pass/proto_arg_map_context.cc b/paddle/infrt/dialect/phi/pass/proto_arg_map_context.cc index 1cd5b5a85511fe20e8029185caf4c93d95979b72..070867853ad3e427f62c825727de2d15f0442c96 100644 --- a/paddle/infrt/dialect/phi/pass/proto_arg_map_context.cc +++ b/paddle/infrt/dialect/phi/pass/proto_arg_map_context.cc @@ -32,17 +32,24 @@ bool ProtoArgumentMappingContext::HasOutput(const std::string& name) const { } bool ProtoArgumentMappingContext::HasAttr(const std::string& name) const { + if (name == "is_test") return true; return op_->hasAttr(name); } paddle::any ProtoArgumentMappingContext::Attr(const std::string& name) const { - mlir::Attribute attrs = op_->getAttr(name); - if (mlir::StringAttr str_attr = attrs.dyn_cast_or_null()) { + if (name == "is_test") { + return paddle::any(true); + } + mlir::Attribute attr = op_->getAttr(name); + if (!attr) { + return paddle::any(); + } + if (mlir::StringAttr str_attr = attr.dyn_cast()) { return paddle::any(str_attr.str()); - } else { - // ToDO: implementation in the ext PR. - return paddle::any(0); } + + // ToDO: implementation in the ext PR. 
+ return paddle::any(0); } size_t ProtoArgumentMappingContext::InputSize(const std::string& name) const { diff --git a/paddle/infrt/dialect/tensorrt/CMakeLists.txt b/paddle/infrt/dialect/tensorrt/CMakeLists.txt index 99c335ed1782e8089f77bb3f21aadb00f6f6864f..5b62b78e4dab14cc16bf275105937bd5df210b72 100755 --- a/paddle/infrt/dialect/tensorrt/CMakeLists.txt +++ b/paddle/infrt/dialect/tensorrt/CMakeLists.txt @@ -6,6 +6,7 @@ gather_srcs(infrt_src SRCS trt_op_teller_pass.cc trt_graph_fuse_pass.cc trt_graph_split_pass.cc + trt_type_convert_pass.cc ) mlir_tablegen_on(trt_ops) mlir_add_rewriter(pd_lower_to_trt) diff --git a/paddle/infrt/dialect/tensorrt/trt_exec.cc b/paddle/infrt/dialect/tensorrt/trt_exec.cc index 7af1fa53d12e3113d0fe51e7ba15bbd5c082456c..be239255ffb1bd3e2fbd8a3ef471f78753fe0c6c 100644 --- a/paddle/infrt/dialect/tensorrt/trt_exec.cc +++ b/paddle/infrt/dialect/tensorrt/trt_exec.cc @@ -21,6 +21,26 @@ #include "paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h" #include "paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h" #include "paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h" +#include "paddle/infrt/dialect/tensorrt/trt_type_convert_pass.h" + +#include "paddle/infrt/host_context/core_runtime.h" +#include "paddle/infrt/host_context/kernel_registry.h" +#include "paddle/infrt/host_context/mlir_to_runtime_translate.h" + +#include "paddle/infrt/kernel/basic_kernels.h" +#include "paddle/infrt/kernel/control_flow_kernels.h" +#include "paddle/infrt/kernel/tensor_kernels.h" +#include "paddle/infrt/kernel/tensor_shape_kernels.h" +#include "paddle/infrt/kernel/test_kernels.h" + +#include "paddle/infrt/kernel/tensorrt/registry.h" + +#ifdef INFRT_WITH_PHI +#include "paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.h" +#include "paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h" +#include "paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launchers.h" +#include "paddle/infrt/kernel/phi/registry.h" +#endif int main(int argc, char** argv) { static llvm::cl::opt input_file( @@ -33,6 +53,22 @@ int main(int argc, char** argv) { mlir::MLIRContext* context = infrt::Global::getMLIRContext(); auto module = infrt::dialect::LoadMlirFile(input_file.c_str(), context); + infrt::host_context::KernelRegistry registry; + + ::infrt::kernel::RegisterBasicKernels(®istry); + ::infrt::kernel::RegisterTestKernels(®istry); + ::infrt::kernel::RegisterTensorShapeKernels(®istry); + ::infrt::kernel::RegisterTensorKernels(®istry); + ::infrt::kernel::RegisterControlFlowKernels(®istry); +#ifdef INFRT_WITH_PHI + ::infrt::kernel::RegisterPhiKernels(®istry); + ::infrt::kernel::RegisterInferShapeLaunchers(®istry); +#endif +#if defined(INFRT_WITH_GPU) && defined(INFRT_WITH_TRT) + ::infrt::kernel::RegisterTrtKernels(®istry); +#endif + + context->loadAllAvailableDialects(); module->dump(); mlir::PassManager pm(context); @@ -41,10 +77,12 @@ int main(int argc, char** argv) { trt_pass_manager.addPass(std::make_unique()); trt_pass_manager.addPass(std::make_unique(1)); trt_pass_manager.addPass(std::make_unique()); + trt_pass_manager.addPass(infrt::trt::createTrtTypeConvertPass()); if (mlir::failed(pm.run(*module))) { std::cout << "\npass failed!\n" << std::endl; return 4; } module->dump(); + ::infrt::host_context::TestMlir(module.get(), ®istry); return 0; } diff --git a/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc b/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc index 19c6b13e971ec779ed178413ca08b42b23dc71d1..1e50b772e081705ec81bd6b093cd9be9b1987bf6 100644 --- 
a/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc @@ -12,10 +12,17 @@ // See the License for the specific language governing permissions and // limitations under the License. #include "paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h" + +#include #include #include + +#include "paddle/infrt/dialect/dense_tensor.h" #include "paddle/infrt/dialect/pd/ir/pd_ops.h" +#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h" +#include "paddle/infrt/dialect/phi/ir/phi_base.h" #include "paddle/infrt/dialect/tensorrt/trt_dialect_types.h" +#include "paddle/infrt/dialect/tensorrt/trt_ops.h" namespace infrt { namespace trt { @@ -41,34 +48,34 @@ struct PD2TRT_GraphLower : public ::mlir::RewritePattern { ::llvm::SmallVector(1, EngineType::get()), trt_inputs, true /*run_once*/); - ::mlir::Block *block = new ::mlir::Block; - block->getOperations().splice(block->begin(), - casted_op.getBody()->getOperations(), - casted_op.getBody()->begin(), - casted_op.getBody()->end()); - create_engine_op.body().push_back(block); + auto &block = create_engine_op.body().emplaceBlock(); + block.getOperations().splice(block.begin(), + casted_op.getBody()->getOperations(), + casted_op.getBody()->begin(), + casted_op.getBody()->end()); - // trt.execute - // outputs - ::llvm::SmallVector<::mlir::Type, 4> execute_outputs_types; - for (auto v : casted_op.getODSResults(0)) { - execute_outputs_types.push_back(v.getType()); - } - // inputs - ::mlir::SmallVector<::mlir::Value, 4> execute_inputs( - create_engine_op.getODSResults(0)); - for (auto v : inputs) { - execute_inputs.push_back(v); - } - auto execute_op = rewriter.create( - ods_loc, execute_outputs_types, execute_inputs); - - ::llvm::SmallVector<::mlir::Value, 4> replace_values; - for (auto v : - ::llvm::SmallVector<::mlir::Value, 4>{execute_op.getODSResults(0)}) { - replace_values.push_back(v); + // trt.compute + ::llvm::SmallVector<::mlir::Value, 4> replace_values2; + auto ctx_op = rewriter.create<::infrt::phi::CreateGPUContextOp>( + ods_loc, + infrt::phi::ContextType::get(rewriter.getContext(), + infrt::TargetType::GPU)); + auto compute_op = rewriter.create( + ods_loc, + ::infrt::DenseTensorListType::get(rewriter.getContext()), + create_engine_op.engine(), + ctx_op.output()); + auto tensor_list_val = compute_op.outputs(); + for (size_t i = 0; i < casted_op.getNumResults(); ++i) { + auto res = casted_op->getResult(i); + auto int_attr = mlir::IntegerAttr::get( + mlir::IntegerType::get(rewriter.getContext(), 32), i); + auto get_tensor_op = rewriter.create<::infrt::dt::TensorListGetTensorOp>( + ods_loc, res.getType(), tensor_list_val, int_attr); + replace_values2.push_back(get_tensor_op.output()); } - rewriter.replaceOp(op, replace_values); + ctx_op->moveBefore(ctx_op->getBlock(), ctx_op->getBlock()->begin()); + rewriter.replaceOp(op, replace_values2); return ::mlir::success(); } }; @@ -82,6 +89,9 @@ void TRTOpConverterPass::runOnOperation() { // this lowering. In our case, we are lowering to TensorRTDialect from // PaddleDialect target.addLegalDialect(); + target.addLegalDialect<::infrt::phi::PHIDialect>(); + target.addLegalDialect<::infrt::dt::DTDialect>(); + target.addLegalDialect(); // Now that the conversion target has been defined, we just need to provide // the set of patterns that will lower the TensorRT operations. 
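One clarification on the converter-pass hunk above: the extra addLegalDialect calls are needed because the rewritten pattern now creates ops from the phi and dense-tensor dialects (the GPU-context op and dt.tensor_list_get_tensor), and MLIR's DialectConversion framework rejects any generated op whose dialect is not marked legal. A minimal sketch of that target setup, using the headers the pass above already includes; it is illustrative, not the pass's exact code:

#include "mlir/Transforms/DialectConversion.h"
#include "paddle/infrt/dialect/dense_tensor.h"
#include "paddle/infrt/dialect/phi/ir/phi_base.h"

void ConfigureTarget(mlir::ConversionTarget& target) {  // NOLINT
  target.addLegalDialect<::infrt::phi::PHIDialect>();  // phi context-creation ops
  target.addLegalDialect<::infrt::dt::DTDialect>();    // dt.tensor_list_get_tensor
  // ...plus the TensorRT dialect itself, as already done in the original pass.
}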
diff --git a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc index ef9ccc82678f4bf2e2b518bf346d25393b9e480c..5918be90cdd303496bac93cec4483bef04d567d0 100644 --- a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc @@ -14,7 +14,9 @@ #include "paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h" +#include #include +#include "paddle/infrt/dialect/dense_tensor.h" #include "paddle/infrt/dialect/infrt/ir/basic_kernels.h" #include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" #include "paddle/infrt/dialect/pd/ir/pd_ops.h" @@ -35,10 +37,12 @@ void TRTOpTellerPass::runOnFunction() { auto *op = worklist.back(); worklist.pop_back(); if (op == nullptr) continue; + if (op->getName().getStringRef().substr(0, 3) != "pd.") continue; if (::llvm::dyn_cast_or_null(op)) continue; if (::llvm::dyn_cast_or_null(op)) continue; if (::llvm::dyn_cast_or_null(op)) continue; if (::llvm::dyn_cast_or_null<::infrt::ReturnOp>(op)) continue; + builder.setInsertionPoint(op); auto loc = getFunction().getLoc(); auto graph_op = builder.create( diff --git a/paddle/infrt/dialect/tensorrt/trt_type_convert_pass.cc b/paddle/infrt/dialect/tensorrt/trt_type_convert_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..cd55fef696a0e2b65a775f3c5c4763cc505d777f --- /dev/null +++ b/paddle/infrt/dialect/tensorrt/trt_type_convert_pass.cc @@ -0,0 +1,169 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/infrt/dialect/tensorrt/trt_type_convert_pass.h" + +#include + +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Casting.h" +#include "mlir/IR/Block.h" +#include "mlir/IR/BuiltinAttributes.h" +#include "mlir/IR/Dialect.h" +#include "mlir/IR/Operation.h" +#include "mlir/IR/OperationSupport.h" +#include "mlir/IR/Value.h" +#include "mlir/Pass/Pass.h" +#include "paddle/infrt/dialect/infrt/common/types.h" +#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" +#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h" +#include "paddle/infrt/dialect/tensorrt/trt_ops.h" + +namespace { + +class TrtTypeConvertPass + : public mlir::PassWrapper { + public: + ::llvm::StringRef getName() const override { return "TrtTypeConvertPass"; } + + void runOnFunction() override; +}; + +void TrtTypeConvertPass::runOnFunction() { + mlir::Block& body = getFunction().front(); + auto* mlir_ctx = getFunction()->getContext(); + mlir::OpBuilder builder(&body, body.begin()); + + std::vector worklist; + mlir::Operation* ctx_op{nullptr}; + worklist.reserve(body.getOperations().size()); + for (auto& op : body) { + worklist.push_back(&op); + if (op.getName().getStringRef() == "phi_dt.create_context.gpu") { + ctx_op = &op; + } + } + + ::infrt::LayoutType layout = ::infrt::LayoutType::NCHW; + ::infrt::TargetType target = ::infrt::TargetType::GPU; + for (auto& op : worklist) { + if (auto tensor_map_get_op = + llvm::dyn_cast<::infrt::phi::TensorMapGetTensorOp>(op)) { + auto res = tensor_map_get_op.output(); + if (auto t = res.getType().dyn_cast<::infrt::DenseTensorType>()) { + auto replace_type = ::infrt::DenseTensorType::get( + mlir_ctx, t.getTarget(), t.getPrecision(), layout); + res.setType(replace_type); + } + } + if (auto create_engine = llvm::dyn_cast<::infrt::trt::CreateEngineOp>(op)) { + // Insert `infrt.gpu.memcpy` op. + for (auto arg : create_engine.getOperands()) { + if (mlir::Operation* producer = arg.getDefiningOp()) { + if (arg.getType().isa<::infrt::DenseTensorType>()) { + builder.setInsertionPointAfter(producer); + auto t = arg.getType().dyn_cast<::infrt::DenseTensorType>(); + if (producer->getName().getStringRef() != + "phi_dt.tensor_map_get_tensor" && + t.getTarget() != ::infrt::TargetType::GPU) { + auto replace_type = ::infrt::DenseTensorType::get( + mlir_ctx, target, t.getPrecision(), layout); + CHECK_NOTNULL(ctx_op); + auto mem_cpy_op = builder.create<::infrt::phi::GpuMemCopyOp>( + arg.getLoc(), + replace_type, + arg, + llvm::dyn_cast<::infrt::phi::CreateGPUContextOp>(ctx_op) + .output(), + mlir::BoolAttr::get(mlir_ctx, /*d2h*/ false)); + arg.replaceAllUsesExcept(mem_cpy_op.output(), mem_cpy_op); + } + } + } else { + auto blockArg = arg.cast(); + if (arg.getType().isa<::infrt::DenseTensorType>()) { + auto t = arg.getType().dyn_cast<::infrt::DenseTensorType>(); + builder.setInsertionPointAfter(ctx_op); + auto replace_type = ::infrt::DenseTensorType::get( + mlir_ctx, ::infrt::TargetType::GPU, t.getPrecision(), layout); + CHECK_NOTNULL(ctx_op); + auto mem_cpy_op = builder.create<::infrt::phi::GpuMemCopyOp>( + blockArg.getLoc(), + replace_type, + blockArg, + llvm::dyn_cast<::infrt::phi::CreateGPUContextOp>(ctx_op) + .output(), + mlir::BoolAttr::get(mlir_ctx, /*d2h*/ false)); + arg.replaceAllUsesExcept(mem_cpy_op.output(), mem_cpy_op); + } + } + } + + // Change ops(in block) types. 
+ auto& block = create_engine.getRegion().getBlocks().front(); + for (auto& op : block.without_terminator()) { + for (size_t i = 0; i < op.getNumResults(); ++i) { + if (auto t = op.getResult(i) + .getType() + .dyn_cast<::infrt::DenseTensorType>()) { + auto replace_type = ::infrt::DenseTensorType::get( + mlir_ctx, ::infrt::TargetType::GPU, t.getPrecision(), layout); + op.getResult(i).setType(replace_type); + } + } + } + } else if (auto list_get_tensor_op = + llvm::dyn_cast<::infrt::dt::TensorListGetTensorOp>(op)) { + auto result = list_get_tensor_op.output(); + if (auto t = result.getType().dyn_cast<::infrt::DenseTensorType>()) { + result.setType(::infrt::DenseTensorType::get( + mlir_ctx, ::infrt::TargetType::GPU, t.getPrecision(), layout)); + } + } else if (auto return_op = llvm::dyn_cast<::infrt::ReturnOp>(op)) { + for (auto arg : return_op->getOperands()) { + if (auto t = arg.getType().dyn_cast<::infrt::DenseTensorType>()) { + if (t.getLayout() != ::infrt::LayoutType::ANY || + t.getTarget() != ::infrt::TargetType::CPU || + t.getPrecision() != ::infrt::PrecisionType::FLOAT32) { + builder.setInsertionPoint(return_op); + CHECK_NOTNULL(ctx_op); + auto mem_cpy_op = builder.create<::infrt::phi::GpuMemCopyOp>( + return_op.getLoc(), + ::infrt::DenseTensorType::get(mlir_ctx, + ::infrt::TargetType::CPU, + t.getPrecision(), + ::infrt::LayoutType::ANY), + arg, + llvm::dyn_cast<::infrt::phi::CreateGPUContextOp>(ctx_op) + .output(), + mlir::BoolAttr::get(mlir_ctx, /*d2h*/ true)); + arg.replaceAllUsesExcept(mem_cpy_op.output(), mem_cpy_op); + } + } + } + } + } +} + +} // namespace + +namespace infrt { +namespace trt { + +std::unique_ptr createTrtTypeConvertPass() { + return std::make_unique(); +} + +} // namespace trt +} // namespace infrt diff --git a/paddle/infrt/dialect/tensorrt/trt_type_convert_pass.h b/paddle/infrt/dialect/tensorrt/trt_type_convert_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..fbc30cdbeb7675285a09cb8abbfa45c38713c4c0 --- /dev/null +++ b/paddle/infrt/dialect/tensorrt/trt_type_convert_pass.h @@ -0,0 +1,25 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include + +namespace infrt { +namespace trt { + +std::unique_ptr createTrtTypeConvertPass(); + +} // namespace trt +} // namespace infrt diff --git a/paddle/infrt/host_context/mlir_to_runtime_translate.cc b/paddle/infrt/host_context/mlir_to_runtime_translate.cc index 7e90f225cffa753fdc8f1ee39cd5fd69d676d8c9..007730151e370da4f53da74b302c4ff43f4b2238 100644 --- a/paddle/infrt/host_context/mlir_to_runtime_translate.cc +++ b/paddle/infrt/host_context/mlir_to_runtime_translate.cc @@ -130,7 +130,7 @@ boost::optional MlirToRuntimeTranslator::EmitAttribute( if (attr.isa()) { auto val = attr.cast(); if (val.getType().isInteger(32)) { - return val.getInt(); + return val.getValue().getSExtValue(); } } return boost::none; @@ -142,7 +142,7 @@ boost::optional MlirToRuntimeTranslator::EmitAttribute( if (attr.isa()) { auto val = attr.cast(); if (val.getType().isInteger(64)) { - return val.getInt(); + return val.getValue().getSExtValue(); } } return boost::none; @@ -233,7 +233,7 @@ boost::optional MlirToRuntimeTranslator::EmitAttribute( \ std::vector res; \ for (auto& v : array) { \ - res.push_back(v.cast().getInt()); \ + res.push_back(v.cast().getValue().getSExtValue()); \ } \ return res; \ } @@ -309,7 +309,7 @@ bool MlirToRuntimeTranslator::EmitGeneralOp( arg_value = GetOpResult(upstream_op); } } - if (arg_value->is_type()) { + if (arg_value->is_type<::phi::DenseTensor>()) { impl_->runtime->FeedInArgs( std::make_pair(std::to_string(i), ValueRef(arg_value))); } diff --git a/paddle/infrt/host_context/value.h b/paddle/infrt/host_context/value.h index 5b92d183b79da21cf9552e8a2f238928962f5832..b0f56f020f4866053d99b01cdc721dfd9f295a36 100644 --- a/paddle/infrt/host_context/value.h +++ b/paddle/infrt/host_context/value.h @@ -147,6 +147,7 @@ class Value : public common::Object { #endif explicit Value(::phi::DenseTensor&& x) : data(std::move(x)) {} explicit Value(::phi::MetaTensor&& x) : data(std::move(x)) {} + explicit Value(::phi::MetaConfig&& x) : data(std::move(x)) {} #ifdef INFRT_WITH_TRT explicit Value(::infrt::backends::tensorrt::TrtEngine&& x) : data(std::move(x)) {} diff --git a/paddle/infrt/kernel/phi/context_kernels.cc b/paddle/infrt/kernel/phi/context_kernels.cc index b27eacf9e522d2bbb8b7ffd70ad57f54e5775499..f38a11077165c8368920f1b328e74c600c24f8d7 100644 --- a/paddle/infrt/kernel/phi/context_kernels.cc +++ b/paddle/infrt/kernel/phi/context_kernels.cc @@ -30,6 +30,7 @@ namespace phi { ::phi::GPUContext context; context.PartialInitWithoutAllocator(); context.SetAllocator(new ::infrt::backends::GpuPhiAllocator{}); + context.SetHostAllocator(new backends::CpuPhiAllocator{}); context.PartialInitWithAllocator(); return context; } diff --git a/paddle/infrt/kernel/phi/dense_tensor_kernels.cc b/paddle/infrt/kernel/phi/dense_tensor_kernels.cc index c8b1bd8c9ebd26bb6f0b4dab4f84c578ab4e5320..66698d36b5504cfce8a0b50aefaf36cada730dfe 100644 --- a/paddle/infrt/kernel/phi/dense_tensor_kernels.cc +++ b/paddle/infrt/kernel/phi/dense_tensor_kernels.cc @@ -13,6 +13,7 @@ // limitations under the License. 
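Context for the one-line SetHostAllocator addition in context_kernels.cc above: the GPU context previously carried only a device allocator, but the GpuMemCpy helper added just below allocates its device-to-host result from the context's host allocator, so the context must now be created with both. A sketch of the resulting construction, mirroring the patch; GpuPhiAllocator and CpuPhiAllocator are assumed to live in ::infrt::backends as used by the patch, and their headers are omitted here:

#include "paddle/phi/backends/gpu/gpu_context.h"

::phi::GPUContext MakeGpuContext() {
  ::phi::GPUContext context;
  context.PartialInitWithoutAllocator();
  // Device allocator, as before.
  context.SetAllocator(new ::infrt::backends::GpuPhiAllocator{});
  // New: host allocator, used by GpuMemCpy(d2h) below to allocate the CPU-side result.
  context.SetHostAllocator(new ::infrt::backends::CpuPhiAllocator{});
  context.PartialInitWithAllocator();
  return context;
}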
#include "paddle/infrt/kernel/phi/dense_tensor_kernels.h" +#include "llvm/Support/ErrorHandling.h" #include "paddle/infrt/common/string.h" #include "paddle/infrt/dialect/phi/data_type.h" #include "paddle/infrt/kernel/phi/context_kernels.h" @@ -228,6 +229,69 @@ int32_t TensorMapGetSize(const ::infrt::phi::DenseTensorMap& map) { return map.size(); } +#ifdef INFRT_WITH_GPU +inline size_t SizeOfDataType(::phi::DataType data_type) { + switch (data_type) { + case ::phi::DataType::BOOL: + case ::phi::DataType::UINT8: + case ::phi::DataType::INT8: + return 1; + case ::phi::DataType::BFLOAT16: + case ::phi::DataType::FLOAT16: + case ::phi::DataType::INT16: + case ::phi::DataType::UINT16: + return 2; + case ::phi::DataType::FLOAT32: + case ::phi::DataType::INT32: + case ::phi::DataType::UINT32: + return 4; + case ::phi::DataType::FLOAT64: + case ::phi::DataType::INT64: + case ::phi::DataType::UINT64: + case ::phi::DataType::COMPLEX64: + return 8; + case ::phi::DataType::COMPLEX128: + return 16; + case ::phi::DataType::UNDEFINED: + return 0; + default: + llvm_unreachable("should not reach here"); + return 0; + } + return 0; +} +::phi::DenseTensor GpuMemCpy(const ::phi::DenseTensor& input, + const ::phi::GPUContext& context, + bool d2h) { + if (d2h) { + ::phi::DenseTensor ret( + const_cast<::phi::Allocator*>(&context.GetHostAllocator()), + input.meta()); + CHECK(input.place().GetType() == ::phi::AllocationType::GPU); + // TODO(wilber): Add sync op and stream. + cudaMemcpyAsync(ret.data(), + input.data(), + SizeOfDataType(input.dtype()) * input.numel(), + cudaMemcpyDeviceToHost, + nullptr); + return ret; + } else { + // h2d + ::phi::DenseTensor ret( + const_cast<::phi::Allocator*>(&context.GetAllocator()), input.meta()); + CHECK(input.place().GetType() == ::phi::AllocationType::CPU || + input.place().GetType() == ::phi::AllocationType::GPUPINNED); + // TODO(wilber): Add sync op and stream. 
+ cudaMemcpyAsync(ret.data(), + input.data(), + SizeOfDataType(input.dtype()) * input.numel(), + cudaMemcpyHostToDevice, + nullptr); + return ret; + } +} +#endif + } // namespace phi } // namespace kernel } // namespace infrt diff --git a/paddle/infrt/kernel/phi/dense_tensor_kernels.h b/paddle/infrt/kernel/phi/dense_tensor_kernels.h index 6cfcc6f91be05938952c41812c1ee3fff4456075..75eab19396fb413b222b1259982b1e1af9b62311 100644 --- a/paddle/infrt/kernel/phi/dense_tensor_kernels.h +++ b/paddle/infrt/kernel/phi/dense_tensor_kernels.h @@ -18,6 +18,7 @@ #include "paddle/infrt/dialect/infrt/common/types.h" #include "paddle/infrt/host_context/kernel_utils.h" #include "paddle/infrt/tensor/phi/tensor_map.h" +#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/dense_tensor.h" namespace infrt { @@ -55,6 +56,12 @@ infrt::phi::DenseTensorMap LoadParams( int32_t TensorMapGetSize(const ::infrt::phi::DenseTensorMap& map); +#ifdef INFRT_WITH_GPU +::phi::DenseTensor GpuMemCpy(const ::phi::DenseTensor& input, + const ::phi::GPUContext& context, + bool d2h); +#endif + } // namespace phi } // namespace kernel } // namespace infrt diff --git a/paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launcher.cc b/paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launcher.cc index 75e3ebbf00ca54ed3fb2d0ca22bb7819300d0b2b..2e40261f27386717deee886494ef047c2f7166d7 100644 --- a/paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launcher.cc +++ b/paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launcher.cc @@ -14,6 +14,7 @@ #include "paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launcher.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/meta_tensor.h" namespace infrt { namespace kernel { @@ -31,6 +32,10 @@ void InferShapedKernelLauncher::CreateKernelFrameForInferShape( infershape_kernel_frame_builder.AddArgument(value); } } + if (infershape_kernel_frame_builder.GetNumArgs() < arg_size_) { + infershape_kernel_frame_builder.AddArgument( + new host_context::Value(::phi::MetaConfig())); + } } void InferShapedKernelLauncher::BuildInferShapeCache( diff --git a/paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launcher.h b/paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launcher.h index 380b45ea5be09903a7d48e436bb9cc8122df7959..770078115321bd1981a3958ac3c63a0e4dc9bdd3 100644 --- a/paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launcher.h +++ b/paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launcher.h @@ -22,11 +22,8 @@ namespace infrt { namespace kernel { struct InferShapedKernelLauncher { - virtual void Invoke(host_context::KernelFrame* frame) = 0; - - virtual ~InferShapedKernelLauncher() = default; - - protected: + explicit InferShapedKernelLauncher(int arg_size) : arg_size_(arg_size) {} + ~InferShapedKernelLauncher() = default; //! Initialize the kernel frame for InferShape kernel. 
// This method will create a new KernelFrame with all the Tensors(currently // only DenseHostTensor) converted into MetaTensors so that the infer-shape @@ -46,6 +43,7 @@ struct InferShapedKernelLauncher { llvm::SmallVector values; llvm::SmallVector<::phi::DDim, 3> tensor_shape_cache; host_context::KernelFrameBuilder infershape_kernel_frame_builder; + const int arg_size_; }; } // namespace kernel diff --git a/paddle/infrt/kernel/phi/infershaped/phi_kernel_launcher.h b/paddle/infrt/kernel/phi/infershaped/phi_kernel_launcher.h index 75c9e554778dcf1488289c6e9e46fb9652f677dd..2dab7f2324d756967c891a214d1c11d186a2b8e9 100644 --- a/paddle/infrt/kernel/phi/infershaped/phi_kernel_launcher.h +++ b/paddle/infrt/kernel/phi/infershaped/phi_kernel_launcher.h @@ -24,46 +24,44 @@ namespace infrt { namespace kernel { +template +struct FuncArgStatics {}; + +template +struct FuncArgStatics { + constexpr static int arg_size = sizeof...(Args); +}; + template -class KernelLauncher : public InferShapedKernelLauncher { - public: +void KernelLauncherFunc(host_context::KernelFrame* frame) { + static InferShapedKernelLauncher launcher( + FuncArgStatics::arg_size); static const uint16_t num_input_tensors{InferShapeHelper::count}; static const bool turn_on_infer_shape_cache{true}; - void Invoke(host_context::KernelFrame* frame) override { + #ifndef NDEBUG - LOG(INFO) << "Kernel.frame: " << frame->DumpArgTypes(); + LOG(INFO) << "Kernel.frame: " << frame->DumpArgTypes(); #endif - // Build the infershape KernelFrame if needed. - // TODO(Superjomn) add unlikely here. - if (infershape_kernel_frame_builder.IsEmpty()) { - CreateKernelFrameForInferShape(frame); + // Build the infershape KernelFrame if needed. + // TODO(Superjomn) add unlikely here. + if (launcher.infershape_kernel_frame_builder.IsEmpty()) { + launcher.CreateKernelFrameForInferShape(frame); #ifndef NDEBUG - LOG(INFO) << "infershape.frame: " - << infershape_kernel_frame_builder.DumpArgTypes(); + LOG(INFO) << "infershape.frame: " + << launcher.infershape_kernel_frame_builder.DumpArgTypes(); #endif + } + if (turn_on_infer_shape_cache) { + if (launcher.IsShapeChanged(num_input_tensors)) { + ::infrt::host_context::KernelImpl::Invoke( + &launcher.infershape_kernel_frame_builder); + launcher.BuildInferShapeCache(num_input_tensors); } - if (turn_on_infer_shape_cache) { - if (!turn_on_infer_shape_cache || IsShapeChanged(num_input_tensors)) { - ::infrt::host_context::KernelImpl::Invoke( - &infershape_kernel_frame_builder); - BuildInferShapeCache(num_input_tensors); - } - } - ::infrt::host_context::KernelImpl::Invoke(frame); } -}; - -template -void KernelLauncherFunc( - KernelLauncher launcher, - host_context::KernelFrame* frame) { - launcher.Invoke(frame); + ::infrt::host_context::KernelImpl::Invoke(frame); } } // namespace kernel diff --git a/paddle/infrt/kernel/phi/registry.cc b/paddle/infrt/kernel/phi/registry.cc index 08683d7cb66ad434d4ed52c057eb0c9f4faef6f6..3b437a439fc3f34cc8987202958aa568703ce9c6 100644 --- a/paddle/infrt/kernel/phi/registry.cc +++ b/paddle/infrt/kernel/phi/registry.cc @@ -52,6 +52,9 @@ void RegisterPhiKernels(host_context::KernelRegistry* registry) { "phi_dt.create_dense_tensor.gpu", INFRT_KERNEL(infrt::kernel::phi::CreateGPUDenseTensor), {"dims", "lod", "layout", "precision"}); + registry->AddKernelWithAttrs("phi_dt.memcpy.gpu", + INFRT_KERNEL(infrt::kernel::phi::GpuMemCpy), + {"d2h"}); #endif registry->AddKernelWithAttrs("phi_dt.load_params", INFRT_KERNEL(infrt::kernel::phi::LoadParams), diff --git a/paddle/infrt/kernel/tensorrt/trt_kernels.cc 
b/paddle/infrt/kernel/tensorrt/trt_kernels.cc index aa7609092b82c8ab08b75bfbd3e252801cc79c7d..2f73c6b13f40d60095f58dc2ddb2aa096cfbbe6f 100644 --- a/paddle/infrt/kernel/tensorrt/trt_kernels.cc +++ b/paddle/infrt/kernel/tensorrt/trt_kernels.cc @@ -14,6 +14,7 @@ #include "paddle/infrt/kernel/tensorrt/trt_kernels.h" #include +#include #include "NvInfer.h" #include "NvInferRuntime.h" #include "NvInferRuntimeCommon.h" @@ -68,7 +69,7 @@ namespace tensorrt { auto& region = operation.getRegion(0); auto& block = region.getBlocks().front(); - std::unordered_map trt_bind_inputs; + std::unordered_map trt_bind_inputs; ValueToITensorMap value_to_trt_tensor_map; ValueToTensorMap value_to_tensor_map; @@ -79,7 +80,7 @@ namespace tensorrt { const std::string input_name = "input_" + std::to_string(idx); auto* v = symbol_table->GetValue(std::to_string(idx)); CHECK_NOTNULL(v); - auto* t = &v->get(); + auto* t = &v->get<::phi::DenseTensor>(); value_to_tensor_map[operand] = t; // TODO(wilber): get input info from mlir. @@ -93,7 +94,7 @@ namespace tensorrt { if (operand.isa()) { // TODO(wilber): A trick: the weights are CPU tensor and inputs are GPU // tensor, so we treat all GPU tensors as inputs to trt. - if (t->place().GetType() == phi::AllocationType::GPU) { + if (t->place().GetType() == ::phi::AllocationType::GPU) { trt_bind_inputs[input_name] = t; nvinfer1::Dims dims; dims.nbDims = t->dims().size() - 1; @@ -106,8 +107,10 @@ namespace tensorrt { } } else { // TODO(wilber): Replace with the op name that generates the weights. - if (operand.getDefiningOp()->getName().getStringRef() != - "phi_dt.create_dense_tensor.cpu") { + std::unordered_set weight_flags{ + "phi_dt.tensor_map_get_tensor", "phi_dt.create_dense_tensor.cpu"}; + if (!weight_flags.count( + operand.getDefiningOp()->getName().getStringRef().str())) { trt_bind_inputs[input_name] = t; nvinfer1::Dims dims; dims.nbDims = t->dims().size() - 1; @@ -167,10 +170,10 @@ void PrintTrtLayer(backends::tensorrt::TrtEngine* engine) { engine->GetEngineInfo(); } -std::vector TrtEngineCompute( - backends::tensorrt::TrtEngine* engine, const phi::GPUContext& context) { +std::vector<::phi::DenseTensor*> TrtEngineCompute( + backends::tensorrt::TrtEngine* engine, const ::phi::GPUContext& context) { engine->Run(context); - std::vector res; + std::vector<::phi::DenseTensor*> res; for (size_t i = 0; i < engine->GetOutputNum(); ++i) { res.push_back(engine->GetOutput("output_" + std::to_string(i))); } diff --git a/paddle/infrt/kernel/tensorrt/trt_kernels.h b/paddle/infrt/kernel/tensorrt/trt_kernels.h index 546ee9dc78852e6967bf8b61ae81563d32beae66..bf23bd45c13415782e29dfd45713fea3a2a0acb0 100644 --- a/paddle/infrt/kernel/tensorrt/trt_kernels.h +++ b/paddle/infrt/kernel/tensorrt/trt_kernels.h @@ -41,8 +41,8 @@ struct MlirOperationWithInfrtSymbol { void PrintTrtLayer(backends::tensorrt::TrtEngine* engine); -std::vector TrtEngineCompute( - backends::tensorrt::TrtEngine* engine, const phi::GPUContext& context); +std::vector<::phi::DenseTensor*> TrtEngineCompute( + backends::tensorrt::TrtEngine* engine, const ::phi::GPUContext& context); } // namespace tensorrt } // namespace kernel diff --git a/paddle/infrt/tests/CMakeLists.txt b/paddle/infrt/tests/CMakeLists.txt index 6f839cdc3954939e8c8d4792facac5a284d25f3f..3c4a2f1cbb8d3b978fbeffeb7d587d255c833484 100644 --- a/paddle/infrt/tests/CMakeLists.txt +++ b/paddle/infrt/tests/CMakeLists.txt @@ -7,3 +7,4 @@ add_test(NAME test_infrt_by_lit COMMAND sh -c "lit -v ${CMAKE_SOURCE_DIR}/paddle 
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/dialect/tensor/tensor_map.mlir.in ${CMAKE_CURRENT_SOURCE_DIR}/dialect/tensor/tensor_map.mlir) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/dialect/phi/linear_cpu.mlir.in ${CMAKE_CURRENT_SOURCE_DIR}/dialect/phi/linear_cpu.mlir) +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/dialect/tensorrt/disabled_linear.mlir.in ${CMAKE_CURRENT_SOURCE_DIR}/dialect/tensorrt/disabled_linear.mlir) diff --git a/paddle/infrt/tests/dialect/phi/phi_test.mlir b/paddle/infrt/tests/dialect/phi/phi_test.mlir index 21ee8ebf0b705894446192b0d5d0bfeb9f10f326..d1e561cd5f995999b4400d6569bcd3fcad9aea0f 100644 --- a/paddle/infrt/tests/dialect/phi/phi_test.mlir +++ b/paddle/infrt/tests/dialect/phi/phi_test.mlir @@ -1,14 +1,32 @@ // RUN: infrtexec -i %s module { - func @predict(%arg0: !infrt.dense_tensor) -> !infrt.dense_tensor { + func @predict(%arg0: !infrt.dense_tensor,%filter: !infrt.dense_tensor, %arg1: !infrt.dense_tensor, %arg2: !infrt.dense_tensor, %arg3: !infrt.dense_tensor, %arg4: !infrt.dense_tensor) -> !infrt.dense_tensor { %2 = "pd.abs"(%arg0) : (!infrt.dense_tensor) -> !infrt.dense_tensor - infrt.return %2 : !infrt.dense_tensor + %3 = "pd.matmul_v2"(%arg0, %2) {trans_x = false, trans_y = false} : (!infrt.dense_tensor, !infrt.dense_tensor) -> !infrt.dense_tensor + %4 = "pd.conv2d"(%3, %filter) {data_format = "NCHW", dilations = [1 : i32, 1 : i32], groups = 1 : si32, padding_algorithm = "EXPLICIT", paddings = [1 : i32, 1 : i32], strides = [2 : i32, 2 : i32]} : (!infrt.dense_tensor, !infrt.dense_tensor) -> !infrt.dense_tensor + %Y, %MeanOut, %VarianceOut = "pd.batch_norm"(%4, %arg1, %arg2, %arg3, %arg4) {data_layout = "NCHW", epsilon = 9.99999974E-6 : f32, momentum = 0.899999976 : f32} : (!infrt.dense_tensor, !infrt.dense_tensor, !infrt.dense_tensor, !infrt.dense_tensor, !infrt.dense_tensor) -> (!infrt.dense_tensor, !infrt.dense_tensor, !infrt.dense_tensor) + %out = "pd.relu"(%Y) : (!infrt.dense_tensor) -> !infrt.dense_tensor + %5 = "pd.elementwise_add"(%out, %out) {axis = -1:si32} : (!infrt.dense_tensor, !infrt.dense_tensor) -> !infrt.dense_tensor + infrt.return %5 : !infrt.dense_tensor } func @main() { %ctx = "phi_dt.create_context.cpu" (): () -> !phi.context - %t = "phi_dt.create_dense_tensor.cpu" (%ctx) {precision=#infrt.precision, layout=#infrt.layout, lod=[1:i64], dims=[1:i64]}: (!phi.context) -> (!infrt.dense_tensor) + %t = "phi_dt.create_dense_tensor.cpu" (%ctx) {precision=#infrt.precision, layout=#infrt.layout, lod=[1], dims=[1, 3, 8, 8]}: (!phi.context) -> (!infrt.dense_tensor) "phi_dt.fill_dense_tensor.f32"(%t) {value=[3.8:f32]} : (!infrt.dense_tensor) -> () - %2 = infrt.call@predict(%t) : (!infrt.dense_tensor) -> !infrt.dense_tensor + %filter = "phi_dt.create_dense_tensor.cpu" (%ctx) {precision=#infrt.precision, layout=#infrt.layout, lod=[1], dims=[3, 3, 8, 8]}: (!phi.context) -> (!infrt.dense_tensor) + "phi_dt.fill_dense_tensor.f32"(%filter) {value=[3.8:f32]} : (!infrt.dense_tensor) -> () + %bias = "phi_dt.create_dense_tensor.cpu" (%ctx) {precision=#infrt.precision, layout=#infrt.layout, lod=[1], dims=[3]}: (!phi.context) -> (!infrt.dense_tensor) + "phi_dt.fill_dense_tensor.f32"(%bias) {value=[1.5:f32]} : (!infrt.dense_tensor) -> () + %mean = "phi_dt.create_dense_tensor.cpu" (%ctx) {precision=#infrt.precision, layout=#infrt.layout, lod=[1], dims=[3]}: (!phi.context) -> (!infrt.dense_tensor) + "phi_dt.fill_dense_tensor.f32"(%mean) {value=[3.5:f32]} : (!infrt.dense_tensor) -> () + %scale = "phi_dt.create_dense_tensor.cpu" (%ctx) {precision=#infrt.precision, 
layout=#infrt.layout, lod=[1], dims=[3]}: (!phi.context) -> (!infrt.dense_tensor) + "phi_dt.fill_dense_tensor.f32"(%scale) {value=[1.0:f32]} : (!infrt.dense_tensor) -> () + %var = "phi_dt.create_dense_tensor.cpu" (%ctx) {precision=#infrt.precision, layout=#infrt.layout, lod=[1], dims=[3]}: (!phi.context) -> (!infrt.dense_tensor) + "phi_dt.fill_dense_tensor.f32"(%var) {value=[0.0:f32]} : (!infrt.dense_tensor) -> () + + %2 = infrt.call@predict(%t, %filter, %bias, %mean, %scale, %var) : (!infrt.dense_tensor, !infrt.dense_tensor, !infrt.dense_tensor,!infrt.dense_tensor,!infrt.dense_tensor,!infrt.dense_tensor) -> !infrt.dense_tensor + + //phi_dt.print_tensor(%t : !infrt.dense_tensor) phi_dt.print_tensor(%2 : !infrt.dense_tensor) infrt.return } diff --git a/paddle/infrt/tests/dialect/tensorrt/disabled_linear.mlir.in b/paddle/infrt/tests/dialect/tensorrt/disabled_linear.mlir.in new file mode 100644 index 0000000000000000000000000000000000000000..74a7de4335065f037cd75af5a901acb0f3aefc5b --- /dev/null +++ b/paddle/infrt/tests/dialect/tensorrt/disabled_linear.mlir.in @@ -0,0 +1,33 @@ +module { + func @main_graph(%map: !phi.dense_tensor_map, %arg0: !infrt.dense_tensor) -> !infrt.dense_tensor { + %0 = "phi_dt.create_context.gpu"() : () -> !phi.context + %1 = "phi_dt.memcpy.gpu"(%arg0, %0) {d2h = false} : (!infrt.dense_tensor, !phi.context) -> !infrt.dense_tensor + + %3 = phi_dt.tensor_map_get_tensor(%map) {name = "linear_0.b_0"} -> !infrt.dense_tensor + %4 = phi_dt.tensor_map_get_tensor(%map) {name = "linear_0.w_0"} -> !infrt.dense_tensor + %5 = "trt.create_engine"(%1, %4, %3) ( { + %10 = "trt.FullyConnected"(%1, %4, %3) {out_channel_num = 10 : si32} : (!infrt.dense_tensor, !infrt.dense_tensor, !infrt.dense_tensor) -> !infrt.dense_tensor + infrt.return %10 : !infrt.dense_tensor + }) {run_once = true} : (!infrt.dense_tensor, !infrt.dense_tensor, !infrt.dense_tensor) -> !trt.engine + %6 = "trt.compute"(%5, %0) : (!trt.engine, !phi.context) -> !infrt.tensor_list + %7 = "dt.tensor_list_get_tensor"(%6) {id = 0 : i32} : (!infrt.tensor_list) -> !infrt.dense_tensor + %8 = "phi_dt.memcpy.gpu"(%7, %0) {d2h = true} : (!infrt.dense_tensor, !phi.context) -> !infrt.dense_tensor + infrt.return %8 : !infrt.dense_tensor + } + + func @main() { + %map = phi_dt.load_combined_params(){model_path="@CMAKE_BINARY_DIR@/linear/linear.pdmodel", + params_path="@CMAKE_BINARY_DIR@/linear/linear.pdiparams"} + + %ctx = "phi_dt.create_context.cpu" (): () -> !phi.context + %input_tensor = "phi_dt.create_dense_tensor.cpu" (%ctx) { + precision=#infrt.precision, + layout=#infrt.layout, + dims=[3:i64, 784:i64, 1:i64, 1:i64], lod=[1:i64]}: (!phi.context) -> (!infrt.dense_tensor) + "phi_dt.fill_dense_tensor.f32"(%input_tensor) {value=[3.8:f32, 2.4:f32, 1.3:f32]} : (!infrt.dense_tensor) -> () + + %res = infrt.call @main_graph(%map, %input_tensor) {} : (!phi.dense_tensor_map, !infrt.dense_tensor) -> !infrt.dense_tensor + "phi_dt.print_tensor" (%res) : (!infrt.dense_tensor) -> () + infrt.return + } +} diff --git a/paddle/phi/api/include/tensor.h b/paddle/phi/api/include/tensor.h index b881b5bac21ca81a00a1d0bbe12b4ac9592ee6b0..934c89fc927e8cfcec1b3af717172e7aabfca4a2 100644 --- a/paddle/phi/api/include/tensor.h +++ b/paddle/phi/api/include/tensor.h @@ -518,6 +518,30 @@ class PADDLE_API Tensor final { /* Part 10: Auto generated Tensor methods */ + /* Part 11: Methods of converting SparseTensor and DenseTensor to each other + */ + /** + * @brief Convert DenseTensor or SparseCsrTensor to SparseCooTensor + * + * @param sparse_dim, The number of 
sparse dimensions + * @return Tensor + */ + Tensor to_sparse_coo(const int64_t sparse_dim) const; + + /** + * @brief Convert DenseTensor or SparseCooTensor to SparseCsrTensor + * + * @return Tensor + */ + Tensor to_sparse_csr() const; + + /** + * @brief Convert SparseCooTensor or SparseCsrTensor to DenseTensor + * + * @return Tensor + */ + Tensor to_dense() const; + private: /** * [ Why use abstract TensorImpl interface here? ] diff --git a/paddle/phi/api/lib/CMakeLists.txt b/paddle/phi/api/lib/CMakeLists.txt index 50c267f653564ebee770c058fdf5fb3af14e9c23..90bea6d98025c7c581033d22df44e84e5509db49 100644 --- a/paddle/phi/api/lib/CMakeLists.txt +++ b/paddle/phi/api/lib/CMakeLists.txt @@ -149,4 +149,4 @@ cc_library(phi_bw_function_api SRCS ${bw_api_source_file} DEPS phi_tensor_raw ph cc_library(sparse_api SRCS ${sparse_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils sparse_api_custom_impl) cc_library(sparse_bw_api SRCS ${sparse_bw_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils sparse_api sparse_api_custom_impl) -cc_library(phi_tensor SRCS tensor_method.cc DEPS phi_tensor_raw phi_function_api api_gen_utils kernel_dispatch infermeta) +cc_library(phi_tensor SRCS tensor_method.cc DEPS phi_tensor_raw phi_function_api api_gen_utils kernel_dispatch infermeta sparse_api) diff --git a/paddle/phi/api/lib/tensor_method.cc b/paddle/phi/api/lib/tensor_method.cc index c502747c4f9fe6f67d027f82085074d06142fbfb..dde9980d0b951421c3c69b8a9b0506e56939af7b 100644 --- a/paddle/phi/api/lib/tensor_method.cc +++ b/paddle/phi/api/lib/tensor_method.cc @@ -19,6 +19,7 @@ limitations under the License. */ #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/tensor_base.h" +#include "paddle/phi/api/include/sparse_api.h" #include "paddle/phi/api/lib/api_gen_utils.h" #include "paddle/phi/api/lib/kernel_dispatch.h" #include "paddle/phi/infermeta/unary.h" @@ -183,5 +184,17 @@ void Tensor::copy_(const Tensor &src, } } +Tensor Tensor::to_sparse_coo(const int64_t sparse_dim) const { + return experimental::sparse::to_sparse_coo(*this, sparse_dim); +} + +Tensor Tensor::to_sparse_csr() const { + return experimental::sparse::to_sparse_csr(*this); +} + +Tensor Tensor::to_dense() const { + return experimental::sparse::to_dense(*this); +} + } // namespace experimental } // namespace paddle diff --git a/paddle/phi/backends/callback_manager.cc b/paddle/phi/backends/callback_manager.cc index e21e8502d8f8c43e7484982354c4ea69253a195f..4a958ef73bfc67d73bcf73f7d50d224beb6b8ae4 100644 --- a/paddle/phi/backends/callback_manager.cc +++ b/paddle/phi/backends/callback_manager.cc @@ -16,16 +16,18 @@ #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/enforce.h" +#include + namespace phi { CallbackManager::CallbackManager(stream::Stream *stream) - : stream_(stream), thread_pool_(1) {} + : stream_(stream), thread_pool_(new ::ThreadPool(1)) {} void CallbackManager::AddCallback(std::function callback) const { auto *callback_func = new std::function(std::move(callback)); auto *func = new std::function([this, callback_func] { std::lock_guard lock(mtx_); - last_future_ = thread_pool_.enqueue([callback_func] { + last_future_ = thread_pool_->enqueue([callback_func] { std::unique_ptr> releaser(callback_func); (*callback_func)(); }); diff --git a/paddle/phi/backends/callback_manager.h b/paddle/phi/backends/callback_manager.h index 359958b7c93e2c4041532a377f35836ca8ae89bc..2bb26745288dfebf7cb669e631b691c490fcbfd6 100644 --- 
a/paddle/phi/backends/callback_manager.h +++ b/paddle/phi/backends/callback_manager.h @@ -14,8 +14,6 @@ #pragma once -#include - #ifdef PADDLE_WITH_CUDA #include #include @@ -30,6 +28,8 @@ #include #include // NOLINT +class ThreadPool; + namespace phi { namespace stream { @@ -50,7 +50,7 @@ class CallbackManager { private: stream::Stream* stream_; - mutable ::ThreadPool thread_pool_; + mutable std::shared_ptr<::ThreadPool> thread_pool_; mutable std::mutex mtx_; mutable std::future last_future_; }; diff --git a/paddle/phi/backends/device_base.h b/paddle/phi/backends/device_base.h index b4964708dfb9797c75e6f69ccb8bae6853b424a9..8cc6e498068fa65d697f6f002bec17b075b42866 100644 --- a/paddle/phi/backends/device_base.h +++ b/paddle/phi/backends/device_base.h @@ -14,6 +14,8 @@ #pragma once #ifdef PADDLE_WITH_CUSTOM_DEVICE +#include + #include "paddle/phi/backends/event.h" #include "paddle/phi/backends/stream.h" diff --git a/paddle/phi/core/compat/op_utils.h b/paddle/phi/core/compat/op_utils.h index 613a2f9960a6ffd2ca4a02f20710018fcc00eaed..d9cff03e89ca212a4bdbde84dbc031ca68f8be6f 100644 --- a/paddle/phi/core/compat/op_utils.h +++ b/paddle/phi/core/compat/op_utils.h @@ -124,6 +124,10 @@ class OpUtilsMap { {std::move(op_type), std::move(base_kernel_name)}); } + bool HasArgumentMappingFn(const std::string& op_type) const { + return arg_mapping_fn_map_.count(op_type); + } + void InsertArgumentMappingFn(std::string op_type, ArgumentMappingFn fn) { PADDLE_ENFORCE_EQ( arg_mapping_fn_map_.count(op_type), diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 4790fa863f272b6defbede1ce54de848175371a1..3aa497606260a1fdb46f6bdb9fecbbda88bb2cac 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -832,6 +832,50 @@ void MultiDotInferMeta(const std::vector& x, MetaTensor* out) { out->share_lod(*x.at(0)); } +void MultiplexInferMeta(const std::vector& ins, + const MetaTensor& ids, + MetaTensor* out) { + PADDLE_ENFORCE_NE( + ins.empty(), + true, + phi::errors::InvalidArgument("MultiInput(X) shouldn't be empty.")); + auto ids_dim = ids.dims(); + PADDLE_ENFORCE_EQ(ids_dim.size(), + 2, + phi::errors::PreconditionNotMet( + "The index tensor must be a vector with 2 dimensions")); + PADDLE_ENFORCE_EQ( + ids_dim[1], + 1, + phi::errors::PreconditionNotMet( + "The index tensor must be a vector with batchSize x 1.")); + + auto ins_dims = GetMetaTensorsDim(ins); + auto num_ins = ins_dims.size(); + PADDLE_ENFORCE_GT( + num_ins, + 1, + phi::errors::InvalidArgument("multiplex operator should have more than " + "one candidate input tensors.")); + + auto in_dim = ins_dims[0]; + PADDLE_ENFORCE_GE( + in_dim.size(), + 2, + phi::errors::InvalidArgument( + "The rank of candidate tensors must be not less than 2.")); + for (size_t i = 1; i < num_ins; i++) { + auto dim = ins_dims[i]; + PADDLE_ENFORCE_EQ( + in_dim, + dim, + phi::errors::PreconditionNotMet( + "All the candidate tensors must have the same size.")); + } + out->set_dims(in_dim); + out->set_dtype(ins[0]->dtype()); +} + void PsroiPoolInferMeta(const MetaTensor& x, const MetaTensor& rois, paddle::optional rois_num, diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index 9088f20481286e0046f3aba9744fbd976cb917e2..ddd7c132fbde78fb5c906b9064c3b09014bc12a3 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -152,6 +152,10 @@ void HierarchicalSigmoidInferMeta(const MetaTensor& x, void MultiDotInferMeta(const std::vector& x, MetaTensor* out); +void 
MultiplexInferMeta(const std::vector& ins, + const MetaTensor& ids, + MetaTensor* out); + void PsroiPoolInferMeta(const MetaTensor& x, const MetaTensor& rois, paddle::optional rois_num, diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 160e8ef56f3894332a1c3318e37907fe8a821154..b76661d49bd49fa8e2672126689604356e8ea9c0 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -22,6 +22,7 @@ limitations under the License. */ #include "paddle/phi/common/type_traits.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/kernels/funcs/parse_qr_mode.h" #include "paddle/phi/kernels/funcs/pooling.h" #include "paddle/phi/kernels/funcs/unfold_functor.h" #include "paddle/phi/kernels/funcs/unsqueeze.h" @@ -1129,6 +1130,44 @@ void RealAndImagInferMeta(const MetaTensor& x, MetaTensor* out) { out->set_layout(x.layout()); } +void QrInferMeta(const MetaTensor& x, + const std::string& mode, + MetaTensor* q, + MetaTensor* r) { + auto x_dims = x.dims(); + int x_rank = x_dims.size(); + PADDLE_ENFORCE_GE( + x_dims.size(), + 2, + phi::errors::InvalidArgument("the rank of input must greater than 2")); + bool compute_q; + bool reduced_mode; + int m = x_dims[x_rank - 2]; + int n = x_dims[x_rank - 1]; + int min_mn = std::min(m, n); + std::tie(compute_q, reduced_mode) = phi::funcs::ParseQrMode(mode); + + if (compute_q) { + int k = reduced_mode ? min_mn : m; + auto q_dims_vec = phi::vectorize(x_dims); + q_dims_vec[q_dims_vec.size() - 1] = k; + q->set_dims(phi::make_ddim(q_dims_vec)); + } else { + q->set_dims(phi::make_ddim({0})); + } + + int k = reduced_mode ? min_mn : m; + auto r_dims_vec = phi::vectorize(x_dims); + r_dims_vec[r_dims_vec.size() - 2] = k; + r_dims_vec[r_dims_vec.size() - 1] = n; + r->set_dims(phi::make_ddim(r_dims_vec)); + + q->share_lod(x); + r->share_lod(x); + q->set_dtype(x.dtype()); + r->set_dtype(x.dtype()); +} + DDim ReduceInferDim(const MetaTensor& x, const std::vector& axis, bool keep_dim, @@ -1847,6 +1886,20 @@ void UnbindInferMeta(const MetaTensor& x, } } +void TrilTriuInferMeta(const MetaTensor& x, + int diagonal, + bool lower, + MetaTensor* out) { + const auto& x_dims = x.dims(); + PADDLE_ENFORCE_GE(x_dims.size(), + 2, + phi::errors::InvalidArgument( + "Input(X)'s rank must be at least 2 in TrilTriuOp.")); + out->set_dims(x.dims()); + out->share_lod(x); + out->set_dtype(x.dtype()); +} + void UnchangedInferMeta(const MetaTensor& x, MetaTensor* out) { out->share_meta(x); } diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index 6187c49de1bfd6999403e0f5cf2626d04029cd41..8e254965ab8da6b57c6fbf5b98f33ced0c08170a 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -180,6 +180,11 @@ void PoolInferMeta(const MetaTensor& x, MetaTensor* out, MetaConfig config = MetaConfig()); +void QrInferMeta(const MetaTensor& x, + const std::string& mode, + MetaTensor* q, + MetaTensor* r); + void RealAndImagInferMeta(const MetaTensor& x, MetaTensor* out); void ReduceInferMeta(const MetaTensor& x, @@ -282,6 +287,11 @@ void TransposeGradInferMeta(const MetaTensor& x, const std::vector& axis, MetaTensor* out); +void TrilTriuInferMeta(const MetaTensor& x, + int diagonal, + bool lower, + MetaTensor* out); + void UnbindInferMeta(const MetaTensor& x, int axis, std::vector* outs); diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index 941ede31400bf6da960425a3b16b3f8576175452..0f77420809c6f6619f087e01826e583e38e702de 100644 --- 
a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -62,6 +62,8 @@ register_kernels(EXCLUDES ${COMMON_BAISC_KERNELS} ${MANUAL_BUILD_KERNELS} DEPS $ # phi sparse kernels add_subdirectory(sparse) +# phi selected_rows kernels +add_subdirectory(selected_rows) copy_if_different(${kernel_declare_file} ${kernel_declare_file_final}) diff --git a/paddle/phi/kernels/cpu/matmul_grad_kernel.cc b/paddle/phi/kernels/cpu/matmul_grad_kernel.cc index c68e8115e898b3701b9f568ac501260615b69ad4..aba519ff04849a54bfe1a69a6381f4298822279f 100644 --- a/paddle/phi/kernels/cpu/matmul_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/matmul_grad_kernel.cc @@ -45,3 +45,17 @@ PD_REGISTER_KERNEL(matmul_triple_grad, double, phi::dtype::complex, phi::dtype::complex) {} + +PD_REGISTER_KERNEL(matmul_with_flatten_grad, + CPU, + ALL_LAYOUT, + phi::MatmulWithFlattenGradKernel, + float, + double) {} + +PD_REGISTER_KERNEL(matmul_with_flatten_double_grad, + CPU, + ALL_LAYOUT, + phi::MatmulWithFlattenDoubleGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/matmul_kernel.cc b/paddle/phi/kernels/cpu/matmul_kernel.cc index 2bf56c07a5bc7485fd29d6ac347a5311915d8f36..8aa25c0da07d9617d1734647d511e3707e60ebc3 100644 --- a/paddle/phi/kernels/cpu/matmul_kernel.cc +++ b/paddle/phi/kernels/cpu/matmul_kernel.cc @@ -28,3 +28,10 @@ PD_REGISTER_KERNEL(matmul, double, phi::dtype::complex, phi::dtype::complex) {} + +PD_REGISTER_KERNEL(matmul_with_flatten, + CPU, + ALL_LAYOUT, + phi::MatmulWithFlattenKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/qr_kernel.cc b/paddle/phi/kernels/cpu/qr_kernel.cc index e2e32567441ae8ff5315856e3f9132c9553f6d62..b0e82cedb6b8b88e04d6e2128b6c3aa438901996 100644 --- a/paddle/phi/kernels/cpu/qr_kernel.cc +++ b/paddle/phi/kernels/cpu/qr_kernel.cc @@ -19,30 +19,10 @@ #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/complex_functors.h" +#include "paddle/phi/kernels/funcs/parse_qr_mode.h" namespace phi { -static inline std::tuple ParseQrMode(const std::string& mode) { - bool compute_q; - bool reduced; - if (mode == "reduced") { - compute_q = true; - reduced = true; - } else if (mode == "complete") { - compute_q = true; - reduced = false; - } else if (mode == "r") { - compute_q = false; - reduced = true; - } else { - PADDLE_THROW(errors::InvalidArgument( - "QR received unrecognized mode '%s'" - " but expected one of 'reduced' (default), 'r', or 'complete'", - mode)); - } - return std::make_tuple(compute_q, reduced); -} - template void QrKernel(const Context& ctx, const DenseTensor& x, @@ -51,7 +31,7 @@ void QrKernel(const Context& ctx, DenseTensor* r) { bool compute_q; bool reduced_mode; - std::tie(compute_q, reduced_mode) = ParseQrMode(mode); + std::tie(compute_q, reduced_mode) = phi::funcs::ParseQrMode(mode); auto numel = x.numel(); PADDLE_ENFORCE_GT( numel, 0, errors::PreconditionNotMet("The input of QR is empty.")); diff --git a/paddle/phi/kernels/funcs/parse_qr_mode.h b/paddle/phi/kernels/funcs/parse_qr_mode.h new file mode 100644 index 0000000000000000000000000000000000000000..adf64759d3ad60307048cdd18f0961cb5ecb4cdf --- /dev/null +++ b/paddle/phi/kernels/funcs/parse_qr_mode.h @@ -0,0 +1,41 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
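For orientation, the mode strings accepted by the shared phi::funcs::ParseQrMode helper above map onto the user-facing QR API roughly as follows. This is a hedged sketch: paddle.linalg.qr is assumed here and is not touched by this diff.

```python
# Illustrative only: the three modes ParseQrMode distinguishes, exercised from Python.
import paddle

x = paddle.randn([4, 3])
q, r = paddle.linalg.qr(x, mode="reduced")             # compute_q=True,  reduced=True  -> q: [4, 3], r: [3, 3]
q_full, r_full = paddle.linalg.qr(x, mode="complete")  # compute_q=True,  reduced=False -> q: [4, 4], r: [4, 3]
r_only = paddle.linalg.qr(x, mode="r")                 # compute_q=False, reduced=True  -> only r is computed
```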
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +namespace phi { +namespace funcs { + +static inline std::tuple ParseQrMode(const std::string& mode) { + bool compute_q; + bool reduced; + if (mode == "reduced") { + compute_q = true; + reduced = true; + } else if (mode == "complete") { + compute_q = true; + reduced = false; + } else if (mode == "r") { + compute_q = false; + reduced = true; + } else { + PADDLE_THROW(errors::InvalidArgument( + "QR received unrecognized mode '%s'" + " but expected one of 'reduced' (default), 'r', or 'complete'", + mode)); + } + return std::make_tuple(compute_q, reduced); +} +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/reduce_function.h b/paddle/phi/kernels/funcs/reduce_function.h index 85c371e9f9d450c55741b901eff6f102fa6c3f6f..17f5cd67ec95712c8b45596d3637b9b0f5f936e8 100644 --- a/paddle/phi/kernels/funcs/reduce_function.h +++ b/paddle/phi/kernels/funcs/reduce_function.h @@ -453,25 +453,20 @@ struct ReduceConfig { void SetReduceType() { int rank = x_dim.size(); int reduce_rank = reduce_dim.size(); - bool is_last_dim = - (rank == 2) && (reduce_rank == 1) && (reduce_dim[0] == 1); - if (rank == reduce_rank || is_last_dim) { #ifdef PADDLE_WITH_XPU_KP - reduce_type = static_cast(ReduceType::kReduceAny); + bool not_higher = x_dim[0] > 1; #else - reduce_type = static_cast(ReduceType::kReduceLastDim); + int device_id = paddle::platform::GetCurrentDeviceId(); + int max_grid_z = phi::backends::gpu::GetGpuMaxGridDimSize(device_id)[2]; + bool not_higher = x_dim[0] >= max_grid_z; #endif + if (reduce_last_dim && (reduce_rank == 1)) { + reduce_type = static_cast(ReduceType::kReduceLastDim); } else if (reduce_rank == 1) { -// ReduceFirstDim and reduceSecondDim -#ifdef PADDLE_WITH_XPU_KP - if (reduce_dim[0] == 0) { - reduce_type = static_cast(ReduceType::kReduceHigherDim); - } else { + reduce_type = static_cast(ReduceType::kReduceHigherDim); + if (rank == 3 && not_higher) { reduce_type = static_cast(ReduceType::kReduceAny); } -#else - reduce_type = static_cast(ReduceType::kReduceHigherDim); -#endif } else { reduce_type = static_cast(ReduceType::kReduceAny); } @@ -648,7 +643,8 @@ __global__ void ReduceAnyKernel(const Tx* x, bool reduce_last_dim, const Calculator reduce_index_calculator, const Calculator left_index_calculator, - const kps::DimConfig dim) { + const kps::DimConfig dim, + bool is_mean) { int input_idx, left_idx, stride; int block_size = 0; bool need_store = true; @@ -752,7 +748,9 @@ __global__ void ReduceAnyKernel(const Tx* x, kps::Reduce( &reduce_var, &reduce_var, reducer, reduce_last_dim); - + if (is_mean) { + reduce_var = reduce_var / static_cast(reduce_num); + } Ty result = static_cast(reduce_var); kps::details::WriteData( y + store_offset + i, &result, static_cast(need_store)); @@ -772,7 +770,9 @@ __global__ void ReduceHigherDimKernel(const Tx* x, int reduce_num, int left_num, int blocking_size, - const kps::DimConfig dim) { + const kps::DimConfig dim, + int mean_div, + bool is_mean) { // when reduce_dim.size() == 1 and reduce_dim[0] != x_dim.size() - 1, this // function will be used auto block = ReduceIndexMapping(dim); @@ -806,6 +806,9 @@ 
__global__ void ReduceHigherDimKernel(const Tx* x, kps::details::ReduceMode::kLocalMode>( &reduce_var, &reduce_compute, reducer, false); } + if (is_mean) { + reduce_var = reduce_var / static_cast(mean_div); + } Ty result = static_cast(reduce_var); kps::WriteData( y + store_offset + idx, &result, block.BlockDimX()); @@ -831,6 +834,10 @@ __global__ void ReduceHigherDimKernel(const Tx* x, kps::details::ReduceMode::kLocalMode>( &reduce_var, &reduce_compute, reducer, false); } + + if (is_mean) { + reduce_var = reduce_var / static_cast(mean_div); + } Ty result = static_cast(reduce_var); kps::WriteData( y + store_offset + idx, &result, dim.rem_x); @@ -848,7 +855,8 @@ static void LaunchReduceKernel(const Tx* x_data, const TransformOp& transform, MPType init, KPStream stream, - ReduceConfig config) { + ReduceConfig config, + bool is_mean = false) { if (config.reduce_type == kReduceLastDim) { int stride_reduce = 1; int stride_left = config.reduce_num; @@ -887,7 +895,8 @@ static void LaunchReduceKernel(const Tx* x_data, config.reduce_last_dim, reduce_index_calculator, left_index_calculator, - dim); + dim, + is_mean && (!config.should_reduce_again)); } else { int reduce_rank = config.reduce_strides.size(); @@ -930,7 +939,8 @@ static void LaunchReduceKernel(const Tx* x_data, config.reduce_last_dim, reduce_index_calculator, left_index_calculator, - dim); + dim, + is_mean && (!config.should_reduce_again)); } if (config.should_reduce_again) { @@ -950,15 +960,18 @@ static void LaunchReduceKernel(const Tx* x_data, kps::DimConfig(grid.x, grid.y, grid.z, block.x, config.grid.y, 0); dim.SetRem(config.left_num % block.x, 0, 0); #ifdef PADDLE_WITH_XPU_KP - grid = 8; - block = 64; + int grid_size = 8; + int block_size = 64; +#else + auto grid_size = grid; + auto block_size = block; #endif ReduceHigherDimKernel< Ty, Ty, MPType, ReduceOp, - kps::IdentityFunctor><<>>( + kps::IdentityFunctor><<>>( config.output_data, y_data, reducer, @@ -967,7 +980,9 @@ static void LaunchReduceKernel(const Tx* x_data, config.grid.y, config.left_num, config.grid.y, - dim); + dim, + config.reduce_num, + is_mean); } } @@ -1034,7 +1049,8 @@ void ReduceKernel(const KPDevice& dev_ctx, const phi::DenseTensor& x, phi::DenseTensor* y, const TransformOp& transform, - const std::vector& origin_reduce_dims) { + const std::vector& origin_reduce_dims, + bool is_mean = false) { #ifdef PADDLE_WITH_XPU_KP auto stream = dev_ctx.x_context()->xpu_stream; #else @@ -1069,8 +1085,18 @@ void ReduceKernel(const KPDevice& dev_ctx, bool use_cub_reduce = config.reduce_num == numel && !kIsTxFP16; #ifndef PADDLE_WITH_XPU_KP if (use_cub_reduce) { - CubTensorReduceImpl( - x_data, y_data, transform, config.reduce_num, dev_ctx, stream); + if (is_mean) { + using Div = kps::DivideFunctor; + CubTensorReduceImpl(x_data, + y_data, + Div(config.reduce_num), + config.reduce_num, + dev_ctx, + stream); + } else { + CubTensorReduceImpl( + x_data, y_data, transform, config.reduce_num, dev_ctx, stream); + } return; } #endif @@ -1115,7 +1141,9 @@ void ReduceKernel(const KPDevice& dev_ctx, config.reduce_num, config.left_num, config.blocking_size, - dim); + dim, + config.reduce_num, + is_mean && (!config.should_reduce_again)); if (config.should_reduce_again) { dim3 block = dim3(config.block.x, 1, 1); @@ -1125,15 +1153,19 @@ void ReduceKernel(const KPDevice& dev_ctx, dim2.SetRem(config.left_num % config.block.x, 0, 0); #ifdef PADDLE_WITH_XPU_KP - grid = 8; - block = 64; + int grid_size = 8; + int block_size = 64; +#else + auto grid_size = grid; + auto block_size = block; #endif 
ReduceHigherDimKernel< Ty, Ty, MPType, ReduceOp, - kps::IdentityFunctor><<>>( + kps::IdentityFunctor><<>>( config.output_data, y_data, reducer, @@ -1142,7 +1174,9 @@ void ReduceKernel(const KPDevice& dev_ctx, config.grid.y, config.left_num, config.grid.y, - dim2); + dim2, + config.reduce_num, + is_mean); } return; } @@ -1151,7 +1185,14 @@ void ReduceKernel(const KPDevice& dev_ctx, // when reduce_dim.size() != 1 and reduce_dim.size() != x_dim.size(), this // function will be used LaunchReduceKernel, TransformOp>( - x_data, y_data, reducer, transform, reducer.initial(), stream, config); + x_data, + y_data, + reducer, + transform, + reducer.initial(), + stream, + config, + is_mean); } } // namespace funcs diff --git a/paddle/phi/kernels/gpu/matmul_grad_kernel.cu b/paddle/phi/kernels/gpu/matmul_grad_kernel.cu index ff23ebd05b52833eef9fd23efb1d8537d1013454..9c80d5e151c1c1bb01d27234c9d2ecf12a361d3b 100644 --- a/paddle/phi/kernels/gpu/matmul_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/matmul_grad_kernel.cu @@ -49,3 +49,19 @@ PD_REGISTER_KERNEL(matmul_triple_grad, phi::dtype::float16, phi::dtype::complex, phi::dtype::complex) {} + +PD_REGISTER_KERNEL(matmul_with_flatten_grad, + GPU, + ALL_LAYOUT, + phi::MatmulWithFlattenGradKernel, + float, + double, + phi::dtype::float16) {} + +PD_REGISTER_KERNEL(matmul_with_flatten_double_grad, + GPU, + ALL_LAYOUT, + phi::MatmulWithFlattenDoubleGradKernel, + float, + double, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/matmul_kernel.cu b/paddle/phi/kernels/gpu/matmul_kernel.cu index 98be79c5f9dab5f1a72d7784dfbe1745d27bd622..20c9a5229aaa66dc7f4663117ef0c102cb41a12d 100644 --- a/paddle/phi/kernels/gpu/matmul_kernel.cu +++ b/paddle/phi/kernels/gpu/matmul_kernel.cu @@ -30,3 +30,11 @@ PD_REGISTER_KERNEL(matmul, phi::dtype::bfloat16, phi::dtype::complex, phi::dtype::complex) {} + +PD_REGISTER_KERNEL(matmul_with_flatten, + GPU, + ALL_LAYOUT, + phi::MatmulWithFlattenKernel, + float, + double, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/reduce.h b/paddle/phi/kernels/gpu/reduce.h index da5315f34479f92bfb0e5d807e28882eafa3d2ac..e47b3afc9c355596382eb9d750394d5533c778e5 100644 --- a/paddle/phi/kernels/gpu/reduce.h +++ b/paddle/phi/kernels/gpu/reduce.h @@ -30,7 +30,8 @@ void Reduce(const KPDevice& dev_ctx, const std::vector& dims, bool keep_dim, DataType out_dtype, - DenseTensor* out) { + DenseTensor* out, + bool is_mean = false) { std::vector reduce_dims = phi::funcs::details::GetReduceDim(dims, x.dims().size(), reduce_all); @@ -57,12 +58,18 @@ void Reduce(const KPDevice& dev_ctx, tmp_tensor, out, TransformOp(reduce_num), - reduce_dims); + reduce_dims, + is_mean); })); } else { using MPType = typename kps::details::MPTypeTrait::Type; phi::funcs::ReduceKernel>( - dev_ctx, x, out, TransformOp(reduce_num), reduce_dims); + dev_ctx, + x, + out, + TransformOp(reduce_num), + reduce_dims, + is_mean); } } } // namespace phi diff --git a/paddle/phi/kernels/gpu/reduce_kernel.cu b/paddle/phi/kernels/gpu/reduce_kernel.cu index 6cbe699e8e05831b049536b06b1fdadcc145537d..fabd13d4a737c3ab99bbb080b4d0275373d44a18 100644 --- a/paddle/phi/kernels/gpu/reduce_kernel.cu +++ b/paddle/phi/kernels/gpu/reduce_kernel.cu @@ -27,8 +27,8 @@ void MeanRawKernel(const Context& dev_ctx, bool reduce_all, DenseTensor* out) { auto out_dtype = x.dtype(); - phi::Reduce( - dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out, true); } template diff --git a/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h 
b/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h index 495b93f2a4ef0f790d53605e4531af7040c6b2ad..25a9db868d35705330dbf70caee18328a457e46b 100644 --- a/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h @@ -1731,4 +1731,163 @@ void MatmulTripleGradKernel(const Context& dev_ctx, } } +template +void MatmulWithFlattenGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out_grad, + int x_num_col_dims, + int y_num_col_dims, + DenseTensor* x_grad, + DenseTensor* y_grad) { + auto x_matrix = x.dims().size() > 2 + ? paddle::framework::ReshapeToMatrix(x, x_num_col_dims) + : x; + auto y_matrix = y.dims().size() > 2 + ? paddle::framework::ReshapeToMatrix(y, y_num_col_dims) + : y; + auto* dout = &out_grad; + + DenseTensor dout_mat(*dout); + dout_mat.Resize({phi::flatten_to_2d(x.dims(), x_num_col_dims)[0], + phi::flatten_to_2d(y.dims(), y_num_col_dims)[1]}); + + auto* dx = x_grad; + auto* dy = y_grad; + + if (dx != nullptr) { + dx->set_lod(x.lod()); + } + if (dy != nullptr) { + dy->set_lod(y.lod()); + } + + auto blas = phi::funcs::GetBlas(dev_ctx); + if (dx) { + dev_ctx.template Alloc(dx); + DenseTensor dx_matrix = + dx->dims().size() > 2 + ? paddle::framework::ReshapeToMatrix(*dx, x_num_col_dims) + : *dx; + + // dx = dout * y'. dx: M x K, dout : M x N, y : K x N + blas.MatMul(dout_mat, false, y_matrix, true, &dx_matrix); + } + if (dy) { + dev_ctx.template Alloc(dy); + DenseTensor dy_matrix = + dy->dims().size() > 2 + ? paddle::framework::ReshapeToMatrix(*dy, y_num_col_dims) + : *dy; + // dy = x' * dout. dy K x N, dout : M x N, x : M x K + blas.MatMul(x_matrix, true, dout_mat, false, &dy_matrix); + } +} + +template +void MatmulWithFlattenDoubleGradKernel( + const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out_grad, + paddle::optional x_grad_grad, + paddle::optional y_grad_grad, + int x_num_col_dims, + int y_num_col_dims, + DenseTensor* x_grad, + DenseTensor* y_grad, + DenseTensor* out_grad_grad) { + auto x_mat = x.dims().size() > 2 + ? paddle::framework::ReshapeToMatrix(x, x_num_col_dims) + : x; + auto y_mat = y.dims().size() > 2 + ? paddle::framework::ReshapeToMatrix(y, y_num_col_dims) + : y; + + const int m = phi::flatten_to_2d(x.dims(), x_num_col_dims)[0]; + const int n = phi::flatten_to_2d(y.dims(), y_num_col_dims)[1]; + + auto* dout = &out_grad; + DenseTensor dout_mat(*dout); + dout_mat.Resize({m, n}); + + auto* ddx = x_grad_grad.get_ptr(); + auto* ddy = y_grad_grad.get_ptr(); + + auto* dx = x_grad; + auto* dy = y_grad; + auto* ddout = out_grad_grad; + + DenseTensor ddout_mat; + if (ddout) { + ddout->set_lod(dout->lod()); + // allocate and reshape ddout + dev_ctx.template Alloc(ddout); + ddout_mat.ShareDataWith(*ddout); + ddout_mat.Resize({m, n}); + } + + auto blas = phi::funcs::GetBlas(dev_ctx); + // a flag to specify whether ddout value has been set, if flag + // is false, MatMul beta should be 0 to set ddout, if flag is + // true, MatMul beta should be 1 to add result to ddout. + bool ddout_flag = false; + if (ddx) { + auto ddx_mat = + ddx->dims().size() > 2 + ? paddle::framework::ReshapeToMatrix(*ddx, x_num_col_dims) + : static_cast(*ddx); + + // dy = ddx' * dout. dy : K x M, ddx' : K x M, dout : M x N + if (dy) { + dy->set_lod(y.lod()); + // allocate and reshape dy + dev_ctx.template Alloc(dy); + DenseTensor dy_mat = + dy->dims().size() > 2 + ? 
paddle::framework::ReshapeToMatrix(*dy, y_num_col_dims) + : *dy; + blas.MatMul(ddx_mat, true, dout_mat, false, &dy_mat); + } + // ddout1 = ddx * y. ddx : M x K, y : K x N, ddout1 : M x N + if (ddout) { + blas.MatMul(ddx_mat, + false, + y_mat, + false, + static_cast(1.0), + &ddout_mat, + static_cast(ddout_flag)); + ddout_flag = true; + } + } + if (ddy) { + auto ddy_mat = + ddy->dims().size() > 2 + ? paddle::framework::ReshapeToMatrix(*ddy, y_num_col_dims) + : static_cast(*ddy); + // dx = dout * ddy'. dout : M x N, ddy' : N x K, dx : M x K + if (dx) { + dx->set_lod(x.lod()); + // allocate and reshape dx + dev_ctx.template Alloc(dx); + DenseTensor dx_mat = + dx->dims().size() > 2 + ? paddle::framework::ReshapeToMatrix(*dx, x_num_col_dims) + : *dx; + blas.MatMul(dout_mat, false, ddy_mat, true, &dx_mat); + } + // ddout2 = x * ddy. x : M x K, ddy : K x N, ddout2 : M x N + if (ddout) { + blas.MatMul(x_mat, + false, + ddy_mat, + false, + static_cast(1.0), + &ddout_mat, + static_cast(ddout_flag)); + } + } +} + } // namespace phi diff --git a/paddle/phi/kernels/impl/matmul_kernel_impl.h b/paddle/phi/kernels/impl/matmul_kernel_impl.h index f6136de5d8d0c3d04c83b0446abc82d0eeb11376..3201923e1b2c6f3dacf307153bdff97c3c20fa97 100644 --- a/paddle/phi/kernels/impl/matmul_kernel_impl.h +++ b/paddle/phi/kernels/impl/matmul_kernel_impl.h @@ -506,4 +506,34 @@ void MatmulKernel(const Context& dev_ctx, MatMulFunction(dev_ctx, x, y, out, transpose_x, transpose_y); } +template +void MatmulWithFlattenKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int x_num_col_dims, + int y_num_col_dims, + DenseTensor* out) { + const DenseTensor x_matrix = + x.dims().size() > 2 + ? paddle::framework::ReshapeToMatrix(x, x_num_col_dims) + : x; + const DenseTensor y_matrix = + y.dims().size() > 2 + ? 
paddle::framework::ReshapeToMatrix(y, y_num_col_dims) + : y; + + dev_ctx.template Alloc(out); + auto z_dim = out->dims(); + if (z_dim.size() != 2) { + out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]}); + } + + auto blas = phi::funcs::GetBlas(dev_ctx); + + blas.MatMul(x_matrix, y_matrix, out); + if (z_dim.size() != 2) { + out->Resize(z_dim); + } +} + } // namespace phi diff --git a/paddle/phi/kernels/matmul_grad_kernel.h b/paddle/phi/kernels/matmul_grad_kernel.h index 10452ff0b7903cfd017f0ebbd73d42df52579b84..41a835db46f71e46daa92783cecd502f46f72186 100644 --- a/paddle/phi/kernels/matmul_grad_kernel.h +++ b/paddle/phi/kernels/matmul_grad_kernel.h @@ -60,4 +60,28 @@ void MatmulTripleGradKernel(const Context& dev_ctx, DenseTensor* out_d_ddx, DenseTensor* out_d_ddy); +template +void MatmulWithFlattenGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out_grad, + int x_num_col_dims, + int y_num_col_dims, + DenseTensor* x_grad, + DenseTensor* y_grad); + +template +void MatmulWithFlattenDoubleGradKernel( + const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out_grad, + paddle::optional x_grad_grad, + paddle::optional y_grad_grad, + int x_num_col_dims, + int y_num_col_dims, + DenseTensor* x_grad, + DenseTensor* y_grad, + DenseTensor* out_grad_grad); + } // namespace phi diff --git a/paddle/phi/kernels/matmul_kernel.h b/paddle/phi/kernels/matmul_kernel.h index b524b9e5863dcbcacaea11df9a96b71570312213..a4c4971499fdf713bff482e0a28001ae9e3c6957 100644 --- a/paddle/phi/kernels/matmul_kernel.h +++ b/paddle/phi/kernels/matmul_kernel.h @@ -29,6 +29,16 @@ void MatmulKernel(const Context& dev_ctx, bool transpose_y, DenseTensor* out); +// In order to be compatible with `mul` op in fluid, +// it is no longer used in 2.x API +template +void MatmulWithFlattenKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int x_num_col_dims, + int y_num_col_dims, + DenseTensor* out); + template DenseTensor Matmul(const Context& dev_ctx, const DenseTensor& x, diff --git a/paddle/phi/kernels/selected_rows/CMakeLists.txt b/paddle/phi/kernels/selected_rows/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..4e6c110c670b439c5f8300c7c7014ab8dce84c2a --- /dev/null +++ b/paddle/phi/kernels/selected_rows/CMakeLists.txt @@ -0,0 +1,3 @@ + +set(SELECTED_ROWS_KERNEL_DEPS dense_tensor selected_rows sparse_csr_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils math_function custom_kernel) +register_kernels(DEPS ${SELECTED_ROWS_KERNEL_DEPS} SUB_DIR "selected_rows_kernel") diff --git a/paddle/phi/kernels/selected_rows/isfinite_kernel_impl.h b/paddle/phi/kernels/selected_rows/impl/isfinite_kernel_impl.h similarity index 100% rename from paddle/phi/kernels/selected_rows/isfinite_kernel_impl.h rename to paddle/phi/kernels/selected_rows/impl/isfinite_kernel_impl.h diff --git a/paddle/phi/kernels/selected_rows/isfinite_kernel.cc b/paddle/phi/kernels/selected_rows/isfinite_kernel.cc index a507cdd0d866c3677d74956d1146139e6f2f92c2..630f6bcf8352b6e07a1b41851e9dd07f7c17be14 100644 --- a/paddle/phi/kernels/selected_rows/isfinite_kernel.cc +++ b/paddle/phi/kernels/selected_rows/isfinite_kernel.cc @@ -19,7 +19,7 @@ #include "paddle/phi/backends/gpu/gpu_context.h" #endif #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/selected_rows/isfinite_kernel_impl.h" +#include "paddle/phi/kernels/selected_rows/impl/isfinite_kernel_impl.h" namespace phi { diff 
--git a/paddle/phi/ops/compat/mul_sig.cc b/paddle/phi/ops/compat/mul_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..8770db1039eb6d38ca36d0cd7d5ac1711eb12f21 --- /dev/null +++ b/paddle/phi/ops/compat/mul_sig.cc @@ -0,0 +1,41 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature MulGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("matmul_with_flatten_grad", + {"X", "Y", GradVarName("Out")}, + {"x_num_col_dims", "y_num_col_dims"}, + {GradVarName("X"), GradVarName("Y")}); +} + +KernelSignature MulDoubleGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("matmul_with_flatten_double_grad", + {"X", "Y", "DOut", "DDX", "DDY"}, + {"x_num_col_dims", "y_num_col_dims"}, + {"DX", "DY", "DDOut"}); +} + +} // namespace phi + +PD_REGISTER_BASE_KERNEL_NAME(mul, matmul_with_flatten); +PD_REGISTER_BASE_KERNEL_NAME(mul_grad, matmul_with_flatten_grad); +PD_REGISTER_BASE_KERNEL_NAME(mul_grad_grad, matmul_with_flatten_double_grad); + +PD_REGISTER_ARG_MAPPING_FN(mul_grad, phi::MulGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(mul_grad_grad, phi::MulDoubleGradOpArgumentMapping); diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index 78a863040ade1a43e9de660bff59f5179535abef..b3500c5724bd2aefc353ba01d92e5deb0af14010 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -76,7 +76,7 @@ if not defined INFERENCE_DEMO_INSTALL_DIR set INFERENCE_DEMO_INSTALL_DIR=%cache_ if not defined LOG_LEVEL set LOG_LEVEL=normal if not defined PRECISION_TEST set PRECISION_TEST=OFF if not defined NIGHTLY_MODE set PRECISION_TEST=OFF -if not defined retry_times set retry_times=3 +if not defined retry_times set retry_times=1 if not defined PYTHON_ROOT set PYTHON_ROOT=C:\Python37 if not defined BUILD_DIR set BUILD_DIR=build set task_name=%1 @@ -234,7 +234,6 @@ set WITH_MKL=OFF set WITH_GPU=OFF set WITH_AVX=OFF set MSVC_STATIC_CRT=ON -set retry_times=1 set ON_INFER=OFF call :cmake || goto cmake_error @@ -267,7 +266,6 @@ rem ------Build windows avx whl package------ set WITH_AVX=ON set ON_INFER=OFF set CUDA_ARCH_NAME=All -set retry_times=4 call :cmake || goto cmake_error call :build || goto build_error @@ -279,7 +277,6 @@ rem ------Build windows no-avx whl package------ set WITH_AVX=OFF set ON_INFER=OFF set CUDA_ARCH_NAME=All -set retry_times=4 call :cmake || goto cmake_error call :build || goto build_error diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index bc19b50616d139e2a2db83ad51f602dff0f0fa7a..f4165d97685f1e6966a3cfd20162155c5399392f 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -209,6 +209,9 @@ function cmake_base() { -DWITH_MKL=${WITH_MKL:-ON} -DWITH_AVX=${WITH_AVX:-OFF} -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All} + 
-DNEW_RELEASE_PYPI=${NEW_RELEASE_PYPI:-OFF} + -DNEW_RELEASE_ALL=${NEW_RELEASE_ALL:-OFF} + -DNEW_RELEASE_JIT=${NEW_RELEASE_JIT:-OFF} -DWITH_PYTHON=${WITH_PYTHON:-ON} -DCUDNN_ROOT=/usr/ -DWITH_TESTING=${WITH_TESTING:-ON} @@ -262,6 +265,9 @@ EOF -DWITH_AVX=${WITH_AVX:-OFF} \ -DNOAVX_CORE_FILE=${NOAVX_CORE_FILE:-""} \ -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All} \ + -DNEW_RELEASE_PYPI=${NEW_RELEASE_PYPI:-OFF} \ + -DNEW_RELEASE_ALL=${NEW_RELEASE_ALL:-OFF} \ + -DNEW_RELEASE_JIT=${NEW_RELEASE_JIT:-OFF} \ -DWITH_PYTHON=${WITH_PYTHON:-ON} \ -DCUDNN_ROOT=/usr/ \ -DWITH_TESTING=${WITH_TESTING:-ON} \ diff --git a/python/paddle/distributed/auto_parallel/cost/__init__.py b/python/paddle/distributed/auto_parallel/cost/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7bc8a81b79f8e493d726fb36c6cb6b21366b74c2 --- /dev/null +++ b/python/paddle/distributed/auto_parallel/cost/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License + +from .base_cost import OP_COST_FACTORY +from .base_cost import Cost +from .comm_op_cost import AllreduceSumCost +from .comp_op_cost import MatmulV2OpCost +from .tensor_cost import TensorCost +from .estimate_cost import CostEstimator diff --git a/python/paddle/distributed/auto_parallel/cost/base_cost.py b/python/paddle/distributed/auto_parallel/cost/base_cost.py new file mode 100644 index 0000000000000000000000000000000000000000..c4ebd836129e262a1ab2ed940ab14449fae9f96e --- /dev/null +++ b/python/paddle/distributed/auto_parallel/cost/base_cost.py @@ -0,0 +1,342 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
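The cost/__init__.py added above re-exports the primitives defined in the modules that follow. A hedged sketch of the resulting import surface (illustrative, not part of this diff):

```python
# Assumed usage of the new cost package surface.
from paddle.distributed.auto_parallel.cost import (
    OP_COST_FACTORY, Cost, TensorCost, CostEstimator)

# Importing the package runs the @register_op_cost decorators defined below,
# so the factory already contains the two ops modeled in this PR.
print(sorted(OP_COST_FACTORY.keys()))  # ['c_allreduce_sum', 'matmul_v2']
```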
+# See the License for the specific language governing permissions and +# limitations under the License + +from collections import OrderedDict +import paddle + +COMM_OP_TYPE = [ + "send_v2", "recv_v2", "c_broadcast", "c_allgather", "c_allreduce_sum" +] +NON_COMP_TYPE = ["while"] + COMM_OP_TYPE +OP_COST_FACTORY = {} + + +def _parse_op_to_desc(op, dist_context=None): + desc = {} + desc["op"] = op.type + vars = op.block.vars + input_desc = OrderedDict() + for input_name in op.input_names: + var_name_list = op.input(input_name) + var_desc = [] + for var_name in var_name_list: + var = vars[var_name] + shape = None + if dist_context is not None: + dist_tensor = dist_context.get_dist_tensor_for_program(var) + shape = dist_tensor.local_sizes() + else: + shape = var.shape + assert shape is not None + var_desc.append((var.dtype, shape)) + input_desc[input_name] = var_desc + desc["inputs"] = input_desc + + output_desc = OrderedDict() + for out_name in op.output_names: + var_name_list = op.output(out_name) + var_desc = [] + for var_name in var_name_list: + var = vars[var_name] + shape = None + if dist_context is not None: + dist_tensor = dist_context.get_dist_tensor_for_program(var) + shape = dist_tensor.local_sizes() + else: + shape = var.shape + assert shape is not None + var_desc.append((var.dtype, shape)) + output_desc[out_name] = var_desc + desc["outputs"] = output_desc + + attr_desc = op.all_attrs + desc["attrs"] = attr_desc + + return desc + + +def parse_to_desc(op=None, dist_op=None, dist_context=None): + desc = None + if op is None and dist_op is not None and dist_context is not None: + desc = _parse_op_to_desc( + op=dist_op.serial_op, dist_context=dist_context) + elif op is not None and dist_op is None and dist_context is None: + desc = _parse_op_to_desc(op) + + return desc + + +def parse_desc_to_str(desc): + def _parse_dtype(dtype): + dtype_str = "" + if dtype == paddle.float32: + dtype_str = "float32" + elif dtype == paddle.float16: + dtype_str = "float16" + elif dtype == paddle.int32: + dtype_str = "int32" + elif dtype == paddle.int64: + dtype_str = "int64" + elif dtype == paddle.uint8: + dtype_str = "uint8" + else: + raise TypeError("Unsupported dtype {}".format(dtype)) + return dtype_str + + assert isinstance(desc, dict) + desc_str_list = [] + desc_str = None + dtype_str_list = [] + dims_list = [] + shape_list = [] + + desc_str_list.append(desc["op"]) + inputs = desc["inputs"] + for key, item in inputs.items(): + for dtype, shape in item: + dtype_str_list.append(_parse_dtype(dtype)) + shape_list += list(shape) + dims = len(shape) + dims_list.append(dims) + + dtype_str = "*".join(dtype_str_list) + dims_list = [str(item) for item in dims_list] + dims_str = "*".join(dims_list) + + shape_list = [str(item) for item in shape_list] + shape_str = "[" + ",".join(shape_list) + "]" + desc_str_list += [dtype_str, dims_str, shape_str] + desc_str = "_".join(desc_str_list) + + return desc_str + + +class CommContext: + _instance = None + _has_instance = False + + def __init__(self, cluster): + if CommContext._has_instance: + return + self.cluster = cluster + self._alpha_base_ring = 8.4 + self._alpha_base_tree = 0 + self._alpha_inter = None + self._alpha_intra = None + self._beta = {} + + def __new__(cls, *args, **kwargs): + if cls._instance is None: + cls._instance = super().__new__(cls, *args, **kwargs) + cls._has_instance = True + return cls._instance + + @property + def alpha_inter(self): + if self._alpha_inter is None: + if self.cluster.alpha.inter == "NVL": + self._alpha_inter = 3.4 + elif self.cluster.alpha.inter == 
"PHB": + self._alpha_inter = 5.7 + return self._alpha_inter + + @property + def alpha_intra(self): + if self._alpha_intra is None: + if cluster.alpha.intra == "NVL": + self._alpha_intra = 28 + elif cluster.alpha.intra == "PHB": + self._alpha_intra = 28 + return self._alpha_intra + + @property + def alpha_base_ring(self): + return self._alpha_base_ring + + @property + def alpha_base_tree(self): + return self._alpha_base_tree + + def get_beta(self, ranks): + key = ','.join(map(str, sorted(ranks))) + max_beta = None + if key in self._beta.keys: + max_beta = self._beta[key] + else: + for i in range(len(ranks)): + for j in range(i + 1, len(ranks)): + if min_beta == None: + min_beta = cluster.get_beta(ranks[i], ranks[j]) + else: + beta = cluster.get_beta(ranks[i], ranks[j]) + if beta > max_beta: + max_beta = beta + self._beta[key] = max_beta + + return max_beta + + +class Cost: + def __init__(self, time=0, memory=0, flops=0): + self.time = time + self.memory = memory + self.flops = flops + + def _check_time(self, val): + assert val >= 0, "Time must be greater than or equal to 0." + + def _check_memory(self, val): + assert isinstance( + val, int) and val >= 0, "Memory must be int and greater than 0." + + def _check_flops(self, val): + assert isinstance( + val, int) and val >= 0, "FLOPs must be int and greater than 0." + + @property + def time(self): + return self._time + + @time.setter + def time(self, val): + self._check_time(val) + self._time = val + + @property + def memory(self): + return self._memory + + @memory.setter + def memory(self, val): + self._check_memory(val) + self._memory = val + + @property + def flops(self): + return self._flops + + @flops.setter + def flops(self, val): + self._check_flops(val) + self._flops = val + + def __add__(self, rhs): + assert isinstance(rhs, Cost) + time = self.time + rhs.time + memory = self.memory + rhs.memory + flops = self.flops + rhs.flops + assert (time >= 0 and memory >= 0 and flops >= 0) + return Cost(time, memory, flops) + + def __sub__(self, rhs): + assert isinstance(rhs, Cost) + time = self.time - rhs.time + memory = self.memory - rhs.memory + flops = self.flops - rhs.flops + assert (time >= 0 and memory >= 0 and flops >= 0) + return Cost(time, memory, flops) + + +class OpCost: + def __init__(self, op=None, op_desc=None): + assert (op is not None and op_desc is None) or (op is None and + op_desc is not None) + self._op = op + self._op_desc = op_desc + self._cost = self.calc_cost() + + @property + def op(self): + return self._op + + @property + def op_desc(self): + return self._op_desc + + @property + def cost(self): + return self._cost + + def calc_time(self): + return 0 + + def calc_memory(self): + return 0 + + def calc_flops(self): + return 0 + + def calc_cost(self): + time = self.calc_time() + memory = self.calc_memory() + flops = self.calc_flops() + cost = Cost(time, memory, flops) + return cost + + +class CommOpCost(OpCost): + OP_TYPE = "COMM" + + def __init__(self, op=None, op_desc=None, comm_context=None): + super(CommOpCost, self).__init__(op=op, op_desc=op_desc) + self._check_comm_op_type() + self._comm_context = comm_context + + @property + def comm_context(self): + return self._comm_context + + @classmethod + def _check_comm_op_type(cls): + if cls.OP_TYPE != "COMM": + if cls.OP_TYPE not in COMM_OP_TYPE: + raise TypeError("Please Check op type in {}, but got {}.". 
+ format(COMM_OP_TYPE, cls.OP_TYPE)) + + +class CompOpCost(OpCost): + OP_TYPE = "COMP" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(CompOpCost, self).__init__(op=op, op_desc=op_desc) + self._check_comp_op_type() + self.cluster = cluster + + @classmethod + def _check_comp_op_type(cls): + if cls.OP_TYPE != "COMP": + if cls.OP_TYPE in NON_COMP_TYPE: + raise TypeError("Please Check op type not in {}, but got {}.". + format(NON_COMP_TYPE, cls.OP_TYPE)) + + +def register_op_cost(cls): + op_type = cls.OP_TYPE + + def register(op_type): + OP_COST_FACTORY[op_type] = cls + + return register(op_type) + + +def calc_time_from_model(op=None, desc=None, cluster=None, comm_context=None): + op_type = op.type if op is not None else desc["op"] + if op_type in COMM_OP_TYPE: + op_cost = OP_COST_FACTORY[op_type](op=op, + op_desc=desc, + comm_context=comm_context) + elif op_type not in NON_COMP_TYPE: + op_cost = OP_COST_FACTORY[op_type](op=op, op_desc=desc, cluster=cluster) + time = op_cost.calc_time() + return time diff --git a/python/paddle/distributed/auto_parallel/cost/comm_op_cost.py b/python/paddle/distributed/auto_parallel/cost/comm_op_cost.py new file mode 100644 index 0000000000000000000000000000000000000000..359f6b6e7862cca769cd8666c6513c640ef3ea05 --- /dev/null +++ b/python/paddle/distributed/auto_parallel/cost/comm_op_cost.py @@ -0,0 +1,28 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License + +from .base_cost import register_op_cost, CommOpCost, OP_COST_FACTORY + + +@register_op_cost +class AllreduceSumCost(CommOpCost): + OP_TYPE = "c_allreduce_sum" + + def __init__(self, op=None, op_desc=None, comm_context=None): + super(OP_COST_FACTORY["c_allreduce_sum"], self).__init__( + op=op, op_desc=op_desc, comm_context=comm_context) + + def calc_time(self): + # NOTE: The actual formula will be filled in the future. + return 0 diff --git a/python/paddle/distributed/auto_parallel/cost/comp_op_cost.py b/python/paddle/distributed/auto_parallel/cost/comp_op_cost.py new file mode 100644 index 0000000000000000000000000000000000000000..c4d88cb25dc1ee0c1b2e571f7e555b3297eb9db1 --- /dev/null +++ b/python/paddle/distributed/auto_parallel/cost/comp_op_cost.py @@ -0,0 +1,33 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
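A hedged sketch of how the helpers defined in base_cost.py above are meant to compose; the timing formulas are still stubs in this PR, so the modeled time is 0 (illustrative, not part of this diff):

```python
# Illustrative only: querying the op-cost registry populated by @register_op_cost.
from paddle.distributed.auto_parallel.cost.base_cost import (
    Cost, calc_time_from_model)

# A hand-written op description; normally this comes from parse_to_desc(),
# with parse_desc_to_str() providing a cache key for it.
desc = {"op": "c_allreduce_sum", "inputs": {}, "outputs": {}, "attrs": {}}
assert calc_time_from_model(desc=desc) == 0

# Cost objects overload + and - so per-op estimates can be accumulated.
total = Cost(time=1, memory=4, flops=8) + Cost(time=2, memory=4, flops=0)
assert (total.time, total.memory, total.flops) == (3, 8, 8)
```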
+# See the License for the specific language governing permissions and +# limitations under the License + +from .base_cost import Cost, register_op_cost, CompOpCost, OP_COST_FACTORY + + +@register_op_cost +class MatmulV2OpCost(CompOpCost): + OP_TYPE = "matmul_v2" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(OP_COST_FACTORY["matmul_v2"], self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function needs to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 diff --git a/python/paddle/distributed/auto_parallel/cost/estimate_cost.py b/python/paddle/distributed/auto_parallel/cost/estimate_cost.py new file mode 100644 index 0000000000000000000000000000000000000000..7bd535af8be97577afa20ccd24dd02a3f949c8ac --- /dev/null +++ b/python/paddle/distributed/auto_parallel/cost/estimate_cost.py @@ -0,0 +1,69 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License + + +class CostEstimator: + def __init__(self, + program, + cluster=None, + dist_context=None, + mode="modeling"): + self._program = program + self._cluster = cluster + self._dist_context = dist_context + self._check_mode(mode) + self._mode = mode + self._global_cost = None + self._local_cost = {} + + @property + def program(self): + return self._program + + @property + def dist_context(self): + return self._dist_context + + @property + def cluster(self): + return self._cluster + + @property + def mode(self): + return self._mode + + @property + def global_cost(self): + return self._global_cost + + @property + def local_cost(self): + return self._local_cost + + def get_op_cost(self): + return 0 + + def get_tensor_cost(self): + return 0 + + def get_global_cost(self): + return 0 + + def get_local_cost(self, rank=None): + return 0 + + def _check_mode(self, mode): + if mode not in ["modeling", "profiling"]: + raise ValueError( + "Just support modeling and profiling, but got {}".format(mode)) diff --git a/python/paddle/distributed/auto_parallel/cost/tensor_cost.py b/python/paddle/distributed/auto_parallel/cost/tensor_cost.py new file mode 100644 index 0000000000000000000000000000000000000000..2db1c06d5960bee5115dbcdcb2cecc1a6669ed77 --- /dev/null +++ b/python/paddle/distributed/auto_parallel/cost/tensor_cost.py @@ -0,0 +1,110 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
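CostEstimator above is still a skeleton and TensorCost is defined just below; the intended call pattern is roughly the following (a hedged sketch, not part of this diff):

```python
# Assumed usage of TensorCost (defined below) and CostEstimator (defined above).
import paddle
from paddle.distributed.auto_parallel.cost import TensorCost, CostEstimator

# Memory cost of a float32 [1024, 1024] tensor described only by shape and dtype.
tc = TensorCost(shape=[1024, 1024], dtype=paddle.float32)
print(tc.cost.memory)  # 4194304 (1024 * 1024 * 4 bytes)

# The estimator currently only validates its mode and exposes stub getters.
est = CostEstimator(program=None, mode="modeling")
print(est.get_global_cost())  # 0
```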
+# See the License for the specific language governing permissions and +# limitations under the License + +from functools import reduce + +import paddle +from paddle.fluid.framework import Variable +from paddle.distributed.auto_parallel.dist_tensor import DistributedTensor + +from .base_cost import Cost + + +class TensorCost: + def __init__(self, tensor=None, dist_tensor=None, shape=None, dtype=None): + self._check_args(tensor, dist_tensor, shape, dtype) + self._tensor = tensor + self._dist_tensor = dist_tensor + self._shape = shape + self._dtype = dtype + self._cost = self.calc_cost() + + @property + def tensor(self): + return self._tensor + + @property + def dist_tensor(self): + return self._dist_tensor + + @property + def shape(self): + return self._shape + + @property + def dtype(self): + return self._dtype + + def _check_args(self, tensor, dist_tensor, shape, dtype): + if tensor is not None: + assert (shape is None and dist_tensor is None and dtype is None) + + if not isinstance(tensor, Variable): + raise TypeError( + "Please check tensor type is Variable, but got {}".format( + type(tensor))) + + elif dist_tensor is not None: + assert (tensor is None and shape is None) + if not isinstance(dist_tensor, DistributedTensor): + raise TypeError( + "Please check dist_tensor type is DistributedTensor, but got {}". + format(type(dist_tensor))) + + elif shape is not None: + assert (tensor is None and dist_tensor is None and + dtype is not None) + if not isinstance(shape, (list, set)): + raise TypeError( + "Please check shape type is list or set, but got {}".format( + type(shape))) + + elif dtype is not None: + assert (tensor is None and dist_tensor is None and + shape is not None) + + @property + def cost(self): + return self._cost + + def calc_cost(self): + dtype = None + shape = None + + if self.dist_tensor: + shape = self.dist_tensor.local_sizes() + dtype = self.dist_tensor.serial_tensor.dtype + elif self.tensor: + shape = self.tensor.shape + dtype = self.tensor.dtype + elif self.shape and self.dtype: + shape = self.shape + dtype = self.dtype + + total_count = reduce(lambda x, y: x * y, shape) + + if dtype == paddle.float32 or dtype == paddle.int32: + dtype_factor = 4 + elif dtype == paddle.int64: + dtype_factor = 8 + elif dtype == paddle.uint8: + dtype_factor = 1 + else: + dtype_factor = 2 + + memory = total_count * dtype_factor + assert memory >= 0 + cost = Cost(memory=memory) + + return cost diff --git a/python/paddle/distributed/fleet/launch.py b/python/paddle/distributed/fleet/launch.py index 0d985a523251754ff4335d76cd4ced7ef3f42f49..c5a9df50589ccd36bbd228822da7c29094ad9b1e 100644 --- a/python/paddle/distributed/fleet/launch.py +++ b/python/paddle/distributed/fleet/launch.py @@ -242,7 +242,8 @@ see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/tra elastic_group.add_argument( "--force", type=bool, default=False, help="update np force") - return parser.parse_args() + known_args, _ = parser.parse_known_args() + return known_args def get_cluster_from_args(args, device_mode, devices_per_proc): diff --git a/python/paddle/distributed/launch/context/__init__.py b/python/paddle/distributed/launch/context/__init__.py index 510f49d8246f128c896712e9e0ad0776fa6f7626..e03d832767e6fac85c242d6563da363f7cbdd4a3 100644 --- a/python/paddle/distributed/launch/context/__init__.py +++ b/python/paddle/distributed/launch/context/__init__.py @@ -25,12 +25,13 @@ class Context(object): def __init__(self, enable_plugin=True): self.args, self.unknown_args = parse_args() 
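The launch.py hunk above switches from parse_args() to parse_known_args() so that flags the legacy launcher does not recognize no longer abort startup; a minimal standard-library illustration:

```python
# Minimal argparse illustration of the parse_known_args() behaviour relied on above.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--nproc_per_node", type=int, default=1)

# parser.parse_args(["--nproc_per_node", "2", "--new_flag", "1"]) would exit with
# "unrecognized arguments"; parse_known_args() returns the extras instead.
known, unknown = parser.parse_known_args(["--nproc_per_node", "2", "--new_flag", "1"])
print(known.nproc_per_node, unknown)  # 2 ['--new_flag', '1']
```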
self.envs = fetch_envs() - self.logger = self.get_logger() + + self.set_env_in_args() self.node = Node() self.status = Status() - self.set_env_in_args() + self.logger = self.get_logger() # design for event queue, later self.events = [] diff --git a/python/paddle/distributed/launch/context/device.py b/python/paddle/distributed/launch/context/device.py index 9163e7abd918371ddf4eca388bc912b630684f1f..c2f6896ab6c045da23a142b3ba5a6511c1d9b6ed 100644 --- a/python/paddle/distributed/launch/context/device.py +++ b/python/paddle/distributed/launch/context/device.py @@ -57,7 +57,7 @@ class Device(object): else: self._labels = [] - def get_selected_flag_key(self): + def get_selected_device_key(self): if self._dtype == DeviceType.CPU: return 'FLAGS_selected_cpus' if self._dtype == DeviceType.GPU: @@ -70,19 +70,15 @@ class Device(object): return 'FLAGS_selected_mlus' return 'FLAGS_selected_devices' - def get_selected_flag_label(self, idx): - if idx < len(self._labels): - return self._labels[idx] + def get_selected_devices(self, devices=''): + ''' + return the device label/id relative to the visible devices + ''' + if not devices: + return [str(x) for x in range(0, len(self._labels))] else: - return '0' - - def selected_flags(self, idx=None): - if idx is None: - return {self.get_selected_flag_key(): ','.join(self._labels)} - else: - return { - self.get_selected_flag_key(): self.get_selected_flag_label(idx) - } + devs = [x.strip() for x in devices.split(',')] + return [str(self._labels.index(d)) for d in devs] @classmethod def parse_device(self): diff --git a/python/paddle/distributed/launch/controllers/collective.py b/python/paddle/distributed/launch/controllers/collective.py index 0a6c1c4002abb3d291c47748eddad201fc0d2839..bbcb7c81d6e65c2e570ad3234619d95d9d7fdb20 100644 --- a/python/paddle/distributed/launch/controllers/collective.py +++ b/python/paddle/distributed/launch/controllers/collective.py @@ -75,6 +75,9 @@ class CollectiveController(Controller): job_endpoints = [i['endpoints'] for i in peer_list] self.pod.reset() + selected_dev_key = self.ctx.node.device.get_selected_device_key() + selected_dev_list = self.ctx.node.device.get_selected_devices( + self.ctx.args.devices) for i in range(self.pod.replicas): e = { "PADDLE_MASTER": collective_master, @@ -90,9 +93,9 @@ class CollectiveController(Controller): "PADDLE_RANK_IN_NODE": str(i), } if self.pod.replicas == 1: - e.update(self.ctx.node.device.selected_flags()) + e.update({selected_dev_key: selected_dev_list}) else: - e.update(self.ctx.node.device.selected_flags(i)) + e.update({selected_dev_key: selected_dev_list[i]}) self.add_container(envs=e, log_tag=i) return True diff --git a/python/paddle/distributed/launch/controllers/controller.py b/python/paddle/distributed/launch/controllers/controller.py index 08345a2a1f76b84cfde96667e6329bc1b28c18d4..fbe9df4c9a22398df2343cff6b8091506c159f2f 100644 --- a/python/paddle/distributed/launch/controllers/controller.py +++ b/python/paddle/distributed/launch/controllers/controller.py @@ -210,6 +210,8 @@ class Controller(ControllerBase): if self.ctx.args.nproc_per_node: return int(self.ctx.args.nproc_per_node) + elif self.ctx.args.devices: + return len(self.ctx.args.devices.split(',')) else: return self.ctx.node.device.count diff --git a/python/paddle/distributed/launch/plugins/__init__.py b/python/paddle/distributed/launch/plugins/__init__.py index 1862f75a77f65d39715e031b0ba72ebea6ab5523..35a44ed942c204a3793a7e49fde915e98743ce27 100644 --- a/python/paddle/distributed/launch/plugins/__init__.py +++ 
b/python/paddle/distributed/launch/plugins/__init__.py @@ -29,8 +29,9 @@ def process_args(ctx): #argdev = ctx.args.gpus or ctx.args.xpus or ctx.args.npus argdev = ctx.args.devices if argdev: - ctx.node.device.labels = argdev.split(',') - ctx.logger.debug('Device reset by args {}'.format(argdev)) + for d in argdev.split(','): + assert d in ctx.node.device.labels, 'Device not found {}'.format( + argdev) def collective_compatible(ctx): diff --git a/python/paddle/distributed/passes/auto_parallel_gradient_merge.py b/python/paddle/distributed/passes/auto_parallel_gradient_merge.py index 310358436ae32384b6ac651034f7c020aa4fe6ef..7668dff36207ed700f5aa6378cb0f5532cfedd3f 100644 --- a/python/paddle/distributed/passes/auto_parallel_gradient_merge.py +++ b/python/paddle/distributed/passes/auto_parallel_gradient_merge.py @@ -22,6 +22,10 @@ from paddle.fluid.framework import program_guard, device_guard from paddle.fluid import unique_name, layers from paddle.fluid.clip import append_gradient_clip_ops from .pass_base import PassBase, PassType, register_pass +from paddle.distributed.auto_parallel.utils import set_var_dist_attr +from paddle.distributed.auto_parallel.utils import naive_set_dist_op_attr_for_program_by_mesh_and_mapping +from paddle.distributed.auto_parallel.process_group import get_world_process_group +world_process_group = get_world_process_group() def _is_the_backward_op(op): @@ -68,15 +72,11 @@ def _remove_and_get_optimizer_op(main_program, dist_context): def _remove_op_role_var(param, grad): op_maker = core.op_proto_and_checker_maker op = grad.op - assert _is_the_backward_op(op), \ - 'grad.op={} is not the backward op which produces the grad={}' \ - .format(op, grad.name) - if op.has_attr(op_maker.kOpRoleVarAttrName()): op._remove_attr(op_maker.kOpRoleVarAttrName()) -def _get_gm_cond_var(main_program, k_steps): +def _get_gm_cond_var(main_program, k_steps, dist_context): main_block = main_program.global_block() # Add const var k_step_var = layers.create_global_var( @@ -86,6 +86,7 @@ def _get_gm_cond_var(main_program, k_steps): dtype='int32', persistable=True, force_cpu=True) + set_var_dist_attr(dist_context, k_step_var, [-1], world_process_group.ranks) zero_var = layers.create_global_var( name="gradient_merge_zero", @@ -94,6 +95,7 @@ def _get_gm_cond_var(main_program, k_steps): dtype='int32', persistable=True, force_cpu=True) + set_var_dist_attr(dist_context, zero_var, [-1], world_process_group.ranks) # Add step var & cond var step_var = layers.create_global_var( @@ -103,6 +105,7 @@ def _get_gm_cond_var(main_program, k_steps): dtype='int32', persistable=True, force_cpu=True) + set_var_dist_attr(dist_context, step_var, [-1], world_process_group.ranks) cond_var = layers.create_global_var( name="gradient_merge_cond", @@ -111,24 +114,29 @@ def _get_gm_cond_var(main_program, k_steps): dtype='bool', persistable=False, force_cpu=True) + set_var_dist_attr(dist_context, cond_var, [-1], world_process_group.ranks) with device_guard("cpu"): # step_var = (step_var + 1) % k_step layers.increment(x=step_var, value=1.0, in_place=True) - main_block.append_op( + elementwise_mod_op = main_block.append_op( type='elementwise_mod', inputs={'X': step_var, 'Y': k_step_var}, outputs={'Out': step_var}, attrs={'axis': -1, 'use_mkldnn': False}) + naive_set_dist_op_attr_for_program_by_mesh_and_mapping( + elementwise_mod_op, world_process_group.ranks, [-1], dist_context) # cond_var = (step_var == 0) - main_block.append_op( + equal_op = main_block.append_op( type='equal', inputs={'X': step_var, 'Y': zero_var}, 
outputs={'Out': cond_var}) + naive_set_dist_op_attr_for_program_by_mesh_and_mapping( + equal_op, world_process_group.ranks, [-1], dist_context) return cond_var @@ -137,7 +145,8 @@ def _append_gradient_merge_backward_op( main_program, startup_program, params_grads: List[Tuple[Any, Any]], - cond_var_name: str) -> Tuple[List[Tuple[Any, Any]], Dict[str, Any]]: + cond_var_name: str, + dist_context) -> Tuple[List[Tuple[Any, Any]], Dict[str, Any]]: main_block = main_program.global_block() startup_block = startup_program.global_block() @@ -156,12 +165,19 @@ def _append_gradient_merge_backward_op( param_name = param.name param_var = main_block.var(param_name) assert (param_var is not None) + ref_dist_attr = dist_context.get_tensor_dist_attr_for_program(param_var) + assert ref_dist_attr is not None gradient_merge_var = main_block.create_var( name=param_name + "@GRAD@GradientMerge", shape=param_var.shape, dtype=param_var.dtype, persistable=True) param_to_gradient_merge[param_name] = gradient_merge_var + ref_process_mesh = ref_dist_attr.process_mesh + ref_dims_mapping = ref_dist_attr.dims_mapping + + set_var_dist_attr(dist_context, gradient_merge_var, ref_dims_mapping, + ref_process_mesh) startup_gradient_merge_var = startup_block.create_var( name=param_name + "@GRAD@GradientMerge", @@ -186,6 +202,8 @@ def _append_gradient_merge_backward_op( attrs={'axis': -1, 'use_mkldnn': False}) new_params_to_grads.append([param, gradient_merge_var]) + naive_set_dist_op_attr_for_program_by_mesh_and_mapping( + new_grad_op, ref_process_mesh, ref_dims_mapping, dist_context) return new_params_to_grads, param_to_gradient_merge @@ -240,7 +258,7 @@ def _create_cond_block_and_update_optimizer( new_op_desc.remove_attr(op_maker.kOpRoleVarAttrName()) # op's update Grad - if new_op_desc.input("Grad"): + if core.grad_var_suffix() in new_op_desc.input_arg_names(): grad_value = new_op_desc.input("Grad")[0] # TODO FIXME(xym) support fp16 grad_merge_value = grad_value + '@GradientMerge' @@ -265,7 +283,7 @@ def _create_cond_block_and_update_optimizer( def parse_program(main_program, startup_program, params_grads, k_steps, avg, dist_context): # 1 create gradient_merge_cond - cond_var = _get_gm_cond_var(main_program, k_steps) + cond_var = _get_gm_cond_var(main_program, k_steps, dist_context) # 2 remove optimizer_op from main_program optimize_ops_desc = _remove_and_get_optimizer_op(main_program, dist_context) @@ -275,7 +293,8 @@ def parse_program(main_program, startup_program, params_grads, k_steps, avg, # 3 append gradient merge backward op to main_program new_params_to_grads, param_to_gradient_merge = _append_gradient_merge_backward_op( - main_program, startup_program, params_grads, cond_var.name) + main_program, startup_program, params_grads, cond_var.name, + dist_context) # 4 create ConditionalBlock and append gradient merge optimizer ops _create_cond_block_and_update_optimizer( diff --git a/python/paddle/fluid/communicator.py b/python/paddle/fluid/communicator.py index 2a4f125eb3635a20f075ea4d20d66bf77c04827b..d12af8ee72389846d250d1b0048983746ad275e0 100644 --- a/python/paddle/fluid/communicator.py +++ b/python/paddle/fluid/communicator.py @@ -97,7 +97,9 @@ class Communicator(object): recv_ctx, proto_txt, unit64_hosts, - scope=global_scope()): + scope=None): + if scope == None: + scope = global_scope() self.communicator_ = core.DistCommunicator(self.mode, proto_txt, unit64_hosts, send_ctx, recv_ctx, scope, self.envs) @@ -191,7 +193,9 @@ class Communicator(object): def pull_dense(self, context): 
self.communicator_.pull_dense(context) - def push_sparse_param(self, var_name, table_id=-1, scope=global_scope()): + def push_sparse_param(self, var_name, table_id=-1, scope=None): + if scope == None: + scope = global_scope() if not self.is_running(): raise ValueError( "Communicator should init first. Using fleet.init_worker() before push_sparse_param()" diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py index c11ebf7f8eae6021829d8b541eaa6917f0e657cb..f8ffdb8fefc4e2ea03373e5036d56cfb85a9778d 100644 --- a/python/paddle/fluid/data_feeder.py +++ b/python/paddle/fluid/data_feeder.py @@ -105,9 +105,8 @@ def check_type(input, input_name, expected_type, op_name, extra_message=''): if not isinstance(expected_type, tuple): expected_type = (expected_type, ) expected_type += (core.VarBase, ) - # TODO(jiabin): uncomment it when we support declarative mode in eager - # if _in_eager_mode(): - # expected_type += (core.eager.Tensor, ) + if core._in_eager_mode(): + expected_type += (core.eager.Tensor, ) elif isinstance(input, core.VarBase): raise TypeError( "Please use `with fluid.dygraph.guard()` as context or `fluid.enable_dygraph()` to switch to imperative mode firstly. " diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt index a730d21afa57980538841a3ad7fe874fd2343d4a..c16936db5a3340816709a7789d1b2fc7cd26b2db 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt @@ -17,4 +17,5 @@ if(WITH_DISTRIBUTE AND WITH_GPU) py_test_modules(test_tunable_space MODULES test_tunable_space ENVS ${dist_ENVS}) py_test_modules(test_recorder MODULES test_recorder ENVS ${dist_ENVS}) py_test_modules(test_trial MODULES test_trial ENVS ${dist_ENVS}) + py_test_modules(test_new_cost_model MODULES test_new_cost_model ENVS ${dist_ENVS}) endif() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_new_cost_model.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_new_cost_model.py new file mode 100644 index 0000000000000000000000000000000000000000..0cd3041ea4d25432a6bb22b04e17d17692dbd007 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_new_cost_model.py @@ -0,0 +1,75 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import paddle +import paddle.distributed.auto_parallel.cost as cost_model +from paddle.distributed.auto_parallel.cost.base_cost import parse_to_desc +from paddle.distributed.auto_parallel.cost.base_cost import parse_desc_to_str +from paddle.distributed.auto_parallel.cost.base_cost import calc_time_from_model + +paddle.enable_static() + + +def check_cost(cost): + if cost.memory >= 0 and cost.flops >= 0 and cost.time >= 0: + return True + return False + + +class TestCost(unittest.TestCase): + def test_base_cost(self): + cost = cost_model.Cost(memory=100, flops=200, time=0.5) + self.assertTrue(check_cost(cost)) + + def test_comp_cost(self): + x = paddle.static.data(name="x", shape=[20, 20], dtype='float32') + y = paddle.static.data(name="y", shape=[20, 20], dtype='float32') + + z = paddle.matmul(x, y) + matmul_v2_op = None + ops = paddle.static.default_main_program().global_block().ops + for op in ops: + if op.type == "matmul_v2": + matmul_v2_op = op + break + matmul_v2_cost = cost_model.OP_COST_FACTORY["matmul_v2"]( + op=matmul_v2_op) + desc = parse_to_desc(op=matmul_v2_op) + desc_str = parse_desc_to_str(desc) + self.assertIsNotNone(desc_str) + self.assertTrue(check_cost(matmul_v2_cost.cost)) + time = calc_time_from_model(op=matmul_v2_op) + self.assertEqual(time, matmul_v2_cost.cost.time) + tensor_cost = cost_model.TensorCost(tensor=x) + # check memory + self.assertEqual(tensor_cost.cost.memory, 1600) + + def test_comm_cost(self): + desc = {} + desc["op"] = "c_allreduce_sum" + desc["inputs"] = {"X": [([100, 200], paddle.float32)]} + allreduce_cost = cost_model.OP_COST_FACTORY["c_allreduce_sum"]( + op_desc=desc) + self.assertTrue(check_cost(allreduce_cost.cost)) + + def test_cost_estimator(self): + train_program = paddle.static.Program() + cost_estimator = cost_model.CostEstimator(train_program) + self.assertIsNotNone(cost_estimator) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_gradient_merge_pass.py b/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_gradient_merge_pass.py index acb67e8a20c8c0cb81b473fecc442f3044a6a0b3..0c324ba8ee9aa2d16fadfd68e8b19e9e9a3a9abf 100644 --- a/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_gradient_merge_pass.py +++ b/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_gradient_merge_pass.py @@ -31,6 +31,7 @@ from paddle.fluid.initializer import NumpyArrayInitializer from paddle.distributed.passes import new_pass, PassManager, PassContext import paddle.distributed.fleet as fleet from dist_pass_test_base import DistPassTestBase +from paddle.distributed.auto_parallel.dist_context import DistributedContext logging.getLogger().setLevel(logging.INFO) paddle.enable_static() @@ -111,14 +112,20 @@ class TestGradientMergePass(DistPassTestBase): def init(self): self._params_grads = None self._config = {"k_steps": 4, "avg": True} + #self._config["dist_context"] = DistributedContext() def apply_passes(self, main_prog, startup_prog): - self._config["params_grads"] = self._params_grads - pass_context = PassContext() - auto_parallel_gradient_merge_pass = new_pass( - "auto_parallel_gradient_merge_pass", self._config) - auto_parallel_gradient_merge_pass.apply([main_prog], [startup_prog], - pass_context) + #self._config["params_grads"] = self._params_grads + #pass_context = PassContext() + #auto_parallel_gradient_merge_pass = new_pass( + # "auto_parallel_gradient_merge_pass", self._config) + 
#auto_parallel_gradient_merge_pass.apply([main_prog], [startup_prog], + # pass_context) + dist_strategy = fleet.DistributedStrategy() + dist_strategy.gradient_merge = True + dist_strategy.gradient_merge_configs = {"k_steps": 4, "avg": True} + dist_strategy.semi_auto = True + fleet.init(is_collective=True, strategy=dist_strategy) def test_result(self): no_pass_rets = self._distributed_launch( @@ -135,7 +142,7 @@ class TestGradientMergePass(DistPassTestBase): gradient_merge=True, batch_size=8, max_step=8) - + """ # avg loss for gradient_merge pass avg_loss = 0 pass_avg_ret_list = [] @@ -156,6 +163,7 @@ class TestGradientMergePass(DistPassTestBase): rtol=self.rtol, atol=self.atol, equal_nan=self.equal_nan)) + """ def get_model(self, place, gradient_merge, batch_size, max_step): paddle.seed(2021) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse.py index 171685e4a40f70190d43101d2fe219298a7190b9..4062a460298345404ce1f8dfbaab5f041fa2a3e3 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse.py @@ -20,6 +20,7 @@ import unittest import paddle from paddle.fluid.dygraph.jit import declarative from paddle.fluid.dygraph.dygraph_to_static.program_translator import ProgramTranslator +import paddle.fluid.core as core from ifelse_simple_func import * @@ -379,7 +380,7 @@ class TestDy2StIfElseRetInt1(unittest.TestCase): return out def test_ast_to_func(self): - self.assertIsInstance(self.out[0], paddle.Tensor) + self.assertIsInstance(self.out[0], (paddle.Tensor, core.eager.Tensor)) self.assertIsInstance(self.out[1], int) @@ -390,8 +391,8 @@ class TestDy2StIfElseRetInt2(TestDy2StIfElseRetInt1): self.out = self.get_dy2stat_out() def test_ast_to_func(self): - self.assertIsInstance(self.out[0], paddle.Tensor) - self.assertIsInstance(self.out[1], paddle.Tensor) + self.assertIsInstance(self.out[0], (paddle.Tensor, core.eager.Tensor)) + self.assertIsInstance(self.out[1], (paddle.Tensor, core.eager.Tensor)) class TestDy2StIfElseRetInt3(TestDy2StIfElseRetInt1): @@ -401,7 +402,7 @@ class TestDy2StIfElseRetInt3(TestDy2StIfElseRetInt1): self.out = self.get_dy2stat_out() def test_ast_to_func(self): - self.assertIsInstance(self.out, paddle.Tensor) + self.assertIsInstance(self.out, (paddle.Tensor, core.eager.Tensor)) class TestDy2StIfElseRetInt4(TestDy2StIfElseRetInt1): diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_partial_program.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_partial_program.py index 220347909f978c11a02e1faa28deb185a399f9f9..427e4c22524519e06ba106a59e0173b021dde176 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_partial_program.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_partial_program.py @@ -118,7 +118,8 @@ class TestWithNestedOutput(unittest.TestCase): self.assertTrue(len(dygraph_res) == len(static_res)) for dy_var, st_var in zip(dygraph_res, static_res): - if isinstance(dy_var, fluid.core.VarBase): + if isinstance(dy_var, + (fluid.core.VarBase, fluid.core.eager.Tensor)): self.assertTrue(np.allclose(dy_var.numpy(), st_var.numpy())) else: self.assertTrue(dy_var, st_var) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_return.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_return.py index 7ab60082c37d0a4b632def30bfe0e79163371259..507133aba98e2ffc14c157be774ca1d4e2527a9e 100644 --- 
a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_return.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_return.py @@ -218,7 +218,7 @@ class TestReturnBase(unittest.TestCase): res = self.dygraph_func(self.input) if isinstance(res, (tuple, list)): return tuple(r.numpy() for r in res) - elif isinstance(res, core.VarBase): + elif isinstance(res, (core.VarBase, core.eager.Tensor)): return res.numpy() return res diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 328301d4be8e9410375c7ec937e78aa3e0e9bb06..3f6f3cf9d06e086b8ccec5109eee213d5cab0670 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -713,44 +713,76 @@ class OpTest(unittest.TestCase): def is_empty(a): return isinstance(a, Empty) - def get_default(idx, all_params_number, defaults): - related_idx = idx - all_params_number + len(defaults) - assert related_idx >= 0, "%d-th arguments don't have default value" % idx - return defaults[related_idx] - - def filter_by_name(x): - names = set(['name', 'out', 'output']) - if isinstance(x, list): return [i for i in x if i not in names] - if isinstance(x, dict): - return {k: v for k, v in x.items() if k not in names} - assert False, "Only support list or dict." + def get_default(idx, defaults): + assert not isinstance( + defaults[idx], Empty + ), "%d-th params of python api don't have default value." % idx + return defaults[idx] def to_defaults_list(params, defaults): return [defaults[p] for p in params if p in defaults] - # NOTE(xiongkun): why don't use input arguments dicts ? - # Because we don't know the python api name of each arguments. - # using parse_arg_and_kwargs, we can get the all api information we need. - api_params, api_defaults = [ - filter_by_name(item) for item in parse_arg_and_kwargs(api) - ] + def parse_attri_value(name, op_inputs, op_attrs): + """ parse true value from inputs and attrs, if there is no name passed by OpTest, return Empty + 1. if the name in op_attrs, use the op_attrs[name] + 2. if the name in op_inputs, convert the op_inputs to [type of default value] + 3. if the name not in op_attrs ans op_inputs, return Empty. (this will use the default value from python api) + """ + if name in op_proto_attrs: + return op_proto_attrs[name] + elif name in op_inputs: + assert op_inputs[name].__len__( + ) == 1, "currently don't support multi-input in attribute." + # why don't use numpy().item() : if the Tensor is float64, we will change it to python.float32, where we loss accuracy: [allclose_op] + # why we reconstruct a tensor: because we want the tensor in cpu. + return paddle.to_tensor( + op_inputs[name][0].numpy(), place='cpu') + else: + return Empty() + + # NOTE(xiongkun): the logic of constructing parameters: + # for example: + # python api: cumprod(x, dim, dtype=None, name=None) + # kernel sig: [["x"], ["dim"], ["out"]]" + # + # we will construct a lot of list with the same length : len == len(api_params), here is 4 + # api_params = ["x", "dim", "dtype", "name"] + # api_defaults = [Empty, Empty, None, None]; empty means no defaults. + # inputs_and_attrs = ["x", "dim"] , the length may shorter or longer than api_params + # input_arguments = [RealValue in self.inputs and self.attrs] + # then ,we will loop for the api_params, construct a result list: + # if the name in ['name', 'dtype', 'out', 'output'], we will use the default value + # else, we will consume a input_arguments. 
(because the name is not corresponding, so we only use the order) + + api_params, api_defaults = parse_arg_and_kwargs(api) api_defaults = to_defaults_list(api_params, api_defaults) + api_defaults = [ + Empty() for i in range(len(api_params) - len(api_defaults)) + ] + api_defaults + assert len(api_defaults) == len( + api_params), "Error happens. contack xiongkun03 to solve." inputs_sig, attrs_sig, outputs_sig = kernel_sig inputs_and_attrs = inputs_sig + attrs_sig - assert ( - len(api_params) == len(inputs_and_attrs) - ), "inputs and attrs length must equals to python api length. (May be output is in argument list?)" input_arguments = [op_proto_ins[name] for name in inputs_sig] + [ - op_proto_attrs[name] if name in op_proto_attrs else Empty() + parse_attri_value(name, op_proto_ins, op_proto_attrs) for name in attrs_sig ] results = [] - for idx, arg in enumerate(input_arguments): - if is_empty(arg): - results.append( - get_default(idx, len(input_arguments), api_defaults)) + api_ignore_param_list = set(['name', 'dtype', 'out', 'output']) + idx_of_op_proto_arguments = 0 + for idx, arg_name in enumerate(api_params): + if arg_name in api_ignore_param_list: + results.append(get_default(idx, api_defaults)) else: - results.append(arg) + assert idx_of_op_proto_arguments < len( + input_arguments), "Assert False." + tmp = input_arguments[idx_of_op_proto_arguments] + idx_of_op_proto_arguments += 1 + if isinstance(tmp, Empty): + results.append(get_default(idx, api_defaults)) + else: + results.append(tmp) + assert len(results) == len(api_params) return results def construct_output_dict_by_kernel_sig(ret_tuple, output_sig): diff --git a/python/paddle/fluid/tests/unittests/test_egr_python_api.py b/python/paddle/fluid/tests/unittests/test_egr_python_api.py index ce771a572e2c19a0089325e95e28507ba49683a1..b985834773d49381fece56158d8e10834b998a08 100644 --- a/python/paddle/fluid/tests/unittests/test_egr_python_api.py +++ b/python/paddle/fluid/tests/unittests/test_egr_python_api.py @@ -251,6 +251,9 @@ class EagerVariablePropertiesAndMethodsTestCase(unittest.TestCase): self.assertTrue(egr_tensor12.place._equals(paddle.fluid.CPUPlace())) self.assertTrue(np.array_equal(egr_tensor12.numpy(), x)) + egr_tensor13 = paddle.randn([2, 2]) + self.assertTrue("eager_tmp" in egr_tensor13.name) + with self.assertRaisesRegexp( ValueError, "The shape of Parameter should not be None"): eager_param = EagerParamBase(shape=None, dtype="float32") diff --git a/python/paddle/fluid/tests/unittests/test_run.py b/python/paddle/fluid/tests/unittests/test_run.py index a2f12fbf5809ba9f026b4160754e850f96182df6..365d3f931c27c180eebd9d3b72c80dac5f9227e5 100644 --- a/python/paddle/fluid/tests/unittests/test_run.py +++ b/python/paddle/fluid/tests/unittests/test_run.py @@ -64,7 +64,10 @@ class Collective_Test(unittest.TestCase): if args: cmd.extend(args.split(" ")) cmd.extend([pyname]) - proc = subprocess.Popen(cmd, env) + env = os.environ.copy() + # virtual devies for testing + env.update({'CUDA_VISIBLE_DEVICES': '0,1,2,3,4,5,6,7'}) + proc = subprocess.Popen(cmd, env=env) return proc def test_collective_1(self): diff --git a/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py b/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py index 8284771920e81db10d22f08cc96ecc58c422833d..010c049c16be5287dce98f2be69eb8a3a7f7dd22 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py @@ -17,25 +17,53 @@ import unittest import numpy as np import paddle from 
paddle import _C_ops +from paddle.fluid import core from paddle.fluid.framework import _test_eager_guard class TestSparseUtils(unittest.TestCase): + def test_create_sparse_coo_tensor(self): + with _test_eager_guard(): + non_zero_indices = [[0, 0, 1, 2, 2], [1, 3, 2, 0, 1]] + non_zero_elements = [1, 2, 3, 4, 5] + dense_shape = [3, 4] + dense_indices = paddle.to_tensor(non_zero_indices) + dense_elements = paddle.to_tensor( + non_zero_elements, dtype='float32') + stop_gradient = False + coo = core.eager.sparse_coo_tensor(dense_indices, dense_elements, + dense_shape, stop_gradient) + print(coo) + + def test_create_sparse_csr_tensor(self): + with _test_eager_guard(): + non_zero_crows = [0, 2, 3, 5] + non_zero_cols = [1, 3, 2, 0, 1] + non_zero_elements = [1, 2, 3, 4, 5] + dense_shape = [3, 4] + dense_crows = paddle.to_tensor(non_zero_crows) + dense_cols = paddle.to_tensor(non_zero_cols) + dense_elements = paddle.to_tensor( + non_zero_elements, dtype='float32') + stop_gradient = False + csr = core.eager.sparse_csr_tensor(dense_crows, dense_cols, + dense_elements, dense_shape, + stop_gradient) + print(csr) + def test_to_sparse_coo(self): with _test_eager_guard(): x = [[0, 1, 0, 2], [0, 0, 3, 0], [4, 5, 0, 0]] non_zero_indices = [[0, 0, 1, 2, 2], [1, 3, 2, 0, 1]] non_zero_elements = [1, 2, 3, 4, 5] dense_x = paddle.to_tensor(x) - #TODO(zhangkaihuo): change to test the corresponding API - out = _C_ops.final_state_to_sparse_coo(dense_x, 2) - print(out) + out = dense_x.to_sparse_coo(2) assert np.array_equal(out.non_zero_indices().numpy(), non_zero_indices) assert np.array_equal(out.non_zero_elements().numpy(), non_zero_elements) - dense_tensor = _C_ops.final_state_to_dense(out) + dense_tensor = out.to_dense() assert np.array_equal(dense_tensor.numpy(), x) def test_to_sparse_csr(self): @@ -45,14 +73,14 @@ class TestSparseUtils(unittest.TestCase): non_zero_cols = [1, 3, 2, 0, 1] non_zero_elements = [1, 2, 3, 4, 5] dense_x = paddle.to_tensor(x) - out = _C_ops.final_state_to_sparse_csr(dense_x) + out = dense_x.to_sparse_csr() print(out) assert np.array_equal(out.non_zero_crows().numpy(), non_zero_crows) assert np.array_equal(out.non_zero_cols().numpy(), non_zero_cols) assert np.array_equal(out.non_zero_elements().numpy(), non_zero_elements) - dense_tensor = _C_ops.final_state_to_dense(out) + dense_tensor = out.to_dense() assert np.array_equal(dense_tensor.numpy(), x) diff --git a/python/setup.py.in b/python/setup.py.in index 2dbefb20bb6e63c9457d67346aa43ae9d67df07e..7c1232c1d413f3006cc17768a452d23e56fdb415 100755 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -307,6 +307,7 @@ packages=['paddle', 'paddle.distributed.auto_parallel', 'paddle.distributed.auto_parallel.operators', 'paddle.distributed.auto_parallel.tuner', + 'paddle.distributed.auto_parallel.cost', 'paddle.distributed.passes', 'paddle.framework', 'paddle.jit', diff --git a/tools/infrt/generate_phi_kernel_dialect.py b/tools/infrt/generate_phi_kernel_dialect.py index f632c9a9dba504d209946e494e55eb970e727629..bfe1e7e88bec4ea806bb6fbd7cd55af54d642a50 100644 --- a/tools/infrt/generate_phi_kernel_dialect.py +++ b/tools/infrt/generate_phi_kernel_dialect.py @@ -22,7 +22,9 @@ attr_type_converter = { "i": 'SI32Attr', "b": 'BoolAttr', "l": 'SI64Attr', - "f": 'F32Attr' + "f": 'F32Attr', + "NSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE": 'StrAttr', + "St6vectorIiSaIiEE": 'I32ArrayAttr' } target_type_converter = {"CPU": "CPU", "GPU": "GPU"} diff --git a/tools/infrt/get_phi_kernel_function.sh b/tools/infrt/get_phi_kernel_function.sh index 
febfe5d04762a43da0710b34e21252ffdf4611ea..612620979674934ff8aa70abdf4967200f20b492 100644 --- a/tools/infrt/get_phi_kernel_function.sh +++ b/tools/infrt/get_phi_kernel_function.sh @@ -38,35 +38,36 @@ python3 ${PADDLE_ROOT}/python/paddle/utils/code_gen/wrapped_infermeta_gen.py \ --wrapped_infermeta_header_path ${temp_path}/generate.h \ --wrapped_infermeta_source_path ${temp_path}/generate.cc -grep PD_REGISTER_INFER_META_FN ${temp_path}/generate.cc \ +find ${PADDLE_ROOT}/paddle/phi/ -name "*.cc" | xargs grep PD_REGISTER_INFER_META_FN ${temp_path}/generate.cc \ | awk -F "\(|,|::|\)" '{print $2, $4}' > ${temp_path}/wrap_info.txt - #step 3:get ir's attr_name. ir_attr_name_info_file=`mktemp` # phi_cpu attr -all_ir_name=`grep -Eo "PDTCPU_Kernel<.*\"" paddle/infrt/dialect/phi/ir/phi_cpu_kernels.td | awk -v FS="<" '{gsub(/\"/,"");print $2}'` +all_ir_name=`grep -Eo "PDTCPU_Kernel<.*\"" ${PADDLE_ROOT}/paddle/infrt/dialect/phi/ir/phi_cpu_kernels.td | awk -v FS="<" '{gsub(/\"/,"");print $2}'` for ir in $all_ir_name do - attr_name=`grep "<\"$ir" -A 3 paddle/infrt/dialect/phi/ir/phi_cpu_kernels.td | grep -Eo "Attr:.*)" \ + attr_name=`grep "<\"$ir" -A 3 ${PADDLE_ROOT}/paddle/infrt/dialect/phi/ir/phi_cpu_kernels.td | grep -Eo "Attr:.*)" \ | awk '{gsub(/F32Attr/,"");gsub(/F64Attr/,"");gsub(/StrAttr/,"");gsub(/BoolAttr/,""); \ gsub(/SI1Attr/,"");gsub(/SI8Attr/,"");gsub(/SI16Attr/,"");gsub(/SI32Attr/,"");gsub(/SI64Attr/,""); \ gsub(/UI1Attr/,"");gsub(/UI8Attr/,"");gsub(/I16Attr/,"");gsub(/I32Attr/,"");gsub(/I64Attr/,""); \ gsub(/I1Attr/,"");gsub(/I8Attr/,"");gsub(/UI16Attr/,"");gsub(/UI32Attr/,"");gsub(/UI64Attr/,""); \ + gsub(/I32ArrayAttr/,"");gsub(/SI32ArrayAttr/,""); \ gsub(/Attr/,"");gsub(/\)/,""); \ gsub(/[,:]/,"");print $a}'` echo phi_cpu.$ir $attr_name >> $ir_attr_name_info_file done # phi_gpu attr -all_ir_name=`grep -Eo "PDTGPU_Kernel<.*\"" paddle/infrt/dialect/phi/ir/phi_gpu_kernels.td | awk -v FS="<" '{gsub(/\"/,"");print $2}'` +all_ir_name=`grep -Eo "PDTGPU_Kernel<.*\"" ${PADDLE_ROOT}/paddle/infrt/dialect/phi/ir/phi_gpu_kernels.td | awk -v FS="<" '{gsub(/\"/,"");print $2}'` for ir in $all_ir_name do - attr_name=`grep "<\"$ir" -A 3 paddle/infrt/dialect/phi/ir/phi_gpu_kernels.td | grep -Eo "Attr:.*)" \ + attr_name=`grep "<\"$ir" -A 3 ${PADDLE_ROOT}/paddle/infrt/dialect/phi/ir/phi_gpu_kernels.td | grep -Eo "Attr:.*)" \ | awk '{gsub(/F32Attr/,"");gsub(/F64Attr/,"");gsub(/StrAttr/,"");gsub(/BoolAttr/,""); \ gsub(/SI1Attr/,"");gsub(/SI8Attr/,"");gsub(/SI16Attr/,"");gsub(/SI32Attr/,"");gsub(/SI64Attr/,""); \ gsub(/UI1Attr/,"");gsub(/UI8Attr/,"");gsub(/I16Attr/,"");gsub(/I32Attr/,"");gsub(/I64Attr/,""); \ gsub(/I1Attr/,"");gsub(/I8Attr/,"");gsub(/UI16Attr/,"");gsub(/UI32Attr/,"");gsub(/UI64Attr/,""); \ - gsub(/Attr/,"");gsub(/\)/,""); \ + gsub(/I32ArrayAttr/,"");gsub(/SI32ArrayAttr/,""); \ + gsub(/Attr/,"");gsub(/\)/,"") \ gsub(/[,:]/,"");print $a}'` echo phi_gpu.$ir $attr_name >> $ir_attr_name_info_file done diff --git a/tools/infrt/get_phi_kernel_info.py b/tools/infrt/get_phi_kernel_info.py index 8b752f928719bcc7ebef4792c29af02261dbd551..db2e56ca328171e1076f14611a637cfb1cc9644d 100644 --- a/tools/infrt/get_phi_kernel_info.py +++ b/tools/infrt/get_phi_kernel_info.py @@ -58,7 +58,7 @@ def get_api_yaml_info(file_path): def get_kernel_info(file_path): f = open(file_path, "r") cont = f.readlines() - return [l.strip() for l in cont] + return [l.strip() for l in cont if l.strip() != ""] def get_attr_info(file_path): @@ -91,11 +91,10 @@ def merge(infer_meta_data, kernel_data, wrap_data): full_kernel_data = [] 
for l in kernel_data: key = l.split()[0] - if key in meta_map: - if key in meta_map: - full_kernel_data.append((l + " " + wrap_map[key]).split()) - else: - full_kernel_data.append((l + " " + meta_map[key]).split()) + if key in wrap_map: + full_kernel_data.append((l + " " + wrap_map[key]).split()) + elif key in meta_map: + full_kernel_data.append((l + " " + meta_map[key]).split()) else: full_kernel_data.append((l + " unknown").split()) @@ -246,15 +245,10 @@ def gen_register_code_info(item: List[str], attr_data: Dict[str, List[str]]): registry->AddKernelWithAttrs("{ir_name}",""" res += f""" - std::bind(&KernelLauncherFunc, - KernelLauncher(), - std::placeholders::_1), {{{attr_names}}}); """ @@ -263,15 +257,10 @@ registry->AddKernelWithAttrs("{ir_name}",""" registry->AddKernel("{ir_name}",""" res += f""" - std::bind(&KernelLauncherFunc, - KernelLauncher(), - std::placeholders::_1)); + {infer_shape_func}>); """ return res diff --git a/tools/infrt/print_kernel_pass_info.py b/tools/infrt/print_kernel_pass_info.py new file mode 100644 index 0000000000000000000000000000000000000000..c2f3e36a675b19b468132d1deb784103207f46bb --- /dev/null +++ b/tools/infrt/print_kernel_pass_info.py @@ -0,0 +1,121 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import re +import json + +skip_list = [] + + +def remove_grad_kernel(kernels): + clean_kernels = [] + for kernel_ in kernels: + if (not "_grad" in kernel_): + clean_kernels.append(kernel_) + return clean_kernels + + +CPU_KERNEL_REGISTER = "REGISTER_OP_CPU_KERNEL(" +GPU_KERNEL_REGISTER = "REGISTER_OP_CUDA_KERNEL(" +XPU_KERNEL_REGISTER = "REGISTER_OP_XPU_KERNEL(" + + +def get_compat_kernels_info(register): + kernels_info = {} + kernel_names = [] + for dirpath, dirnames, filenames in os.walk("../../paddle/fluid/operators"): + for file_name in filenames: + if not ".cc" in file_name: + continue + with open(os.path.join(dirpath, file_name)) as f: + txt = f.readlines() + content = "" + registry = False + is_macro_defination = False + for line in txt: + if line.strip().startswith("#define") and line.strip( + ).endswith("\\"): + is_macro_defination = True + continue + if is_macro_defination: + if not line.strip().endswith("\\"): + is_macro_defination = False + continue + + if (register in line): + content = "" + registry = True + if (registry): + content += line + if (registry and ";" in line): + kernel_name = content.replace("\n", "").replace( + " ", "").strip(register).split(",") + registry = False + kernel_names.append(kernel_name[0]) + return remove_grad_kernel(kernel_names) + + +def show_kernel_statistics(backend, kernels): + print("=== kernels statistics === ") + print("the number of " + backend + " kernels is: " + str(len(kernels)) + + "\n") + print(kernels) + print("\n") + + +def show_pass_statistics(backend, passes): + print("=== Passes Statistics === ") + print("The number of " + backend + " passes is: " + str(len(passes)) + "\n") + print(passes) + print("\n") + + +def get_passes_info(register): + pass_registry_func = "" + with open("../../paddle/fluid/inference/api/paddle_pass_builder.cc") as f: + txt = f.readlines() + stack = [] + registry_fun_found = False + for line in txt: + if line.strip().startswith("//"): + continue + if register in line: + registry_fun_found = True + if (registry_fun_found): + pass_registry_func += line + if registry_fun_found: + for char in line: + if char == "{": + stack.append(char) + if char == "}": + stack.pop() + if len(stack) == 0: + registry_fun_found = False + pass_list = re.findall("\"(.+?)_pass\"", pass_registry_func) + return pass_list + + +if __name__ == "__main__": + cpu_kernels = get_compat_kernels_info(CPU_KERNEL_REGISTER) + gpu_kernels = get_compat_kernels_info(GPU_KERNEL_REGISTER) + xpu_kernels = get_compat_kernels_info(XPU_KERNEL_REGISTER) + show_kernel_statistics("CPU", cpu_kernels) + show_kernel_statistics("GPU", gpu_kernels) + show_kernel_statistics("XPU", xpu_kernels) + + cpu_passes = get_passes_info("CpuPassStrategy::CpuPassStrategy()") + gpu_passes = get_passes_info("GpuPassStrategy::GpuPassStrategy()") + show_pass_statistics("CPU", cpu_passes) + show_pass_statistics("GPU", gpu_passes)
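Note: the new tools/infrt/print_kernel_pass_info.py resolves its inputs through relative paths ("../../paddle/fluid/operators" and "../../paddle/fluid/inference/api/paddle_pass_builder.cc"), so it is expected to be launched from the tools/infrt directory of a Paddle checkout. A minimal sketch of such an invocation follows; the checkout path is a placeholder, not part of this patch:

    # run from tools/infrt so the script's relative paths resolve
    cd /path/to/Paddle/tools/infrt
    # prints per-backend kernel counts and the CPU/GPU pass lists to stdout
    python3 print_kernel_pass_info.py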