diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index 415c0fe9bef9eab89e670d8b3f6f7c330b316ed8..45a76fdc1f1a2aab66e7f4972eecbbec03af941a 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -36,7 +36,7 @@ ENDIF() if(NOT DEFINED XPU_BASE_URL) SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev") - SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220219") + SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220228") else() SET(XPU_BASE_URL "${XPU_BASE_URL}") endif() diff --git a/cmake/generic.cmake b/cmake/generic.cmake index f7c17bd7cfe7e099e0afeaf623724e12387aff44..51ed537ce5db1cad1ea7b6d1921855c1c378e641 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -667,6 +667,7 @@ function(xpu_library TARGET_NAME) else() xpu_add_library(${TARGET_NAME} STATIC ${xpu_library_SRCS} DEPENDS ${xpu_library_DEPS}) find_fluid_modules(${TARGET_NAME}) + find_phi_modules(${TARGET_NAME}) endif() if (xpu_library_DEPS) add_dependencies(${TARGET_NAME} ${xpu_library_DEPS}) diff --git a/cmake/phi.cmake b/cmake/phi.cmake index d9132b84455e7309713b99f9e574bfceb83c7b6c..f6e15758379ada165a9dc0e31273a533b06ad2df 100644 --- a/cmake/phi.cmake +++ b/cmake/phi.cmake @@ -83,6 +83,8 @@ function(kernel_declare TARGET_LIST) file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, XPU, ALL_LAYOUT);\n") elseif (${kernel_path} MATCHES "./gpudnn\/") file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, GPUDNN, ALL_LAYOUT);\n") + elseif (${kernel_path} MATCHES "./kps\/") + file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, KPS, ALL_LAYOUT);\n") else () # deal with device independent kernel, now we use CPU temporaary file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, CPU, ALL_LAYOUT);\n") @@ -97,6 +99,7 @@ function(kernel_library TARGET) set(gpu_srcs) set(xpu_srcs) set(gpudnn_srcs) + set(kps_srcs) set(selected_rows_srcs) # parse and save the deps kerenl targets set(all_srcs) @@ -128,6 +131,9 @@ function(kernel_library TARGET) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu.cc) list(APPEND gpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu.cc) endif() + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu) + list(APPEND gpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu) + endif() if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}_gpudnn.cu) list(APPEND gpudnn_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}_gpudnn.cu) endif() @@ -137,6 +143,15 @@ function(kernel_library TARGET) list(APPEND xpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/xpu/${TARGET}.cc) endif() endif() + if (WITH_XPU_KP) + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu) + # Change XPU2 file suffix + # NOTE(chenweihang): If we can be sure that the *.kps suffix is no longer used, it can be copied directly to *.xpu + file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/kps) + file(RENAME ${CMAKE_CURRENT_BINARY_DIR}/kps/${TARGET}.cu ${CMAKE_CURRENT_BINARY_DIR}/kps/${TARGET}.kps) + list(APPEND kps_srcs ${CMAKE_CURRENT_BINARY_DIR}/kps/${TARGET}.kps) + endif() + endif() else() # TODO(chenweihang): impl compile by source later endif() @@ -150,6 +165,7 @@ function(kernel_library TARGET) list(APPEND all_srcs ${gpu_srcs}) list(APPEND all_srcs ${xpu_srcs}) list(APPEND all_srcs ${gpudnn_srcs}) + list(APPEND all_srcs ${kps_srcs}) foreach(src ${all_srcs}) file(READ ${src} target_content) string(REGEX MATCHALL "#include \"paddle\/phi\/kernels\/[a-z0-9_]+_kernel.h\"" 
include_kernels ${target_content}) @@ -159,11 +175,11 @@ function(kernel_library TARGET) string(REGEX MATCHALL "#include \"paddle\/phi\/kernels\/${kernel_library_SUB_DIR}\/[a-z0-9_]+_kernel.h\"" include_kernels ${target_content}) endif() foreach(include_kernel ${include_kernels}) - if ("${kernel_library_SUB_DIR}" STREQUAL "") - string(REGEX REPLACE "#include \"paddle\/phi\/kernels\/" "" kernel_name ${include_kernel}) - else() - string(REGEX REPLACE "#include \"paddle\/phi\/kernels\/${kernel_library_SUB_DIR}\/" "" kernel_name ${include_kernel}) - endif() + if ("${kernel_library_SUB_DIR}" STREQUAL "") + string(REGEX REPLACE "#include \"paddle\/phi\/kernels\/" "" kernel_name ${include_kernel}) + else() + string(REGEX REPLACE "#include \"paddle\/phi\/kernels\/${kernel_library_SUB_DIR}\/" "" kernel_name ${include_kernel}) + endif() string(REGEX REPLACE ".h\"" "" kernel_name ${kernel_name}) list(APPEND kernel_deps ${kernel_name}) endforeach() @@ -176,11 +192,20 @@ function(kernel_library TARGET) list(LENGTH gpu_srcs gpu_srcs_len) list(LENGTH xpu_srcs xpu_srcs_len) list(LENGTH gpudnn_srcs gpudnn_srcs_len) + list(LENGTH kps_srcs kps_srcs_len) list(LENGTH selected_rows_srcs selected_rows_srcs_len) + # kernel source file level + # level 1: base device kernel + # - cpu_srcs / gpu_srcs / xpu_srcs / kps_srcs + # level 2: device-independent kernel + # - common_srcs + # level 3: Kernel implemented by reusing device-independent kernel + # - selected_rows_srcs + # Build Target according different src organization if((${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR - ${xpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) AND + ${xpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0) AND (${common_srcs_len} GREATER 0 OR ${selected_rows_srcs_len} GREATER 0)) # If the common_srcs/selected_rows_srcs depends on specific device srcs, build target using this rule. if (WITH_GPU) @@ -193,6 +218,11 @@ function(kernel_library TARGET) hip_library(${TARGET}_part SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) hip_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part) endif() + elseif (WITH_XPU_KP) + if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0) + xpu_library(${TARGET}_part SRCS ${cpu_srcs} ${xpu_srcs} ${kps_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + xpu_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part) + endif() else() if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0) cc_library(${TARGET}_part SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) @@ -200,7 +230,7 @@ function(kernel_library TARGET) endif() endif() # If there are only specific device srcs, build target using this rule. 
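Illustrative aside, not part of the patch: with the new kps/ branch added to kernel_declare above, a kernel that ships a paddle/phi/kernels/kps/*.cu source gets declared under the KPS backend in the generated declarations header, roughly as follows (the kernel name is a hypothetical example):

    // appended by kernel_declare for a kps/ source
    PD_DECLARE_KERNEL(elementwise_add, KPS, ALL_LAYOUT);

When WITH_XPU_KP is set, the same kps/*.cu file is additionally copied into the build tree and renamed to *.kps so that xpu_library can compile it, as handled in the hunk above.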
- elseif (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) + elseif (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0) if (WITH_GPU) if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) nv_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) @@ -209,6 +239,10 @@ function(kernel_library TARGET) if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) hip_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) endif() + elseif (WITH_XPU_KP) + if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0) + xpu_library(${TARGET} SRCS ${cpu_srcs} ${xpu_srcs} ${kps_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + endif() else() if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0) cc_library(${TARGET} SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) @@ -222,6 +256,9 @@ function(kernel_library TARGET) elseif (WITH_ROCM) hip_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) hip_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part) + elseif (WITH_XPU_KP) + xpu_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + xpu_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part) else() cc_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) cc_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part) @@ -232,6 +269,8 @@ function(kernel_library TARGET) nv_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) elseif (WITH_ROCM) hip_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + elseif (WITH_XPU_KP) + xpu_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) else() cc_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) endif() @@ -240,6 +279,8 @@ function(kernel_library TARGET) nv_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) elseif (WITH_ROCM) hip_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + elseif (WITH_XPU_KP) + xpu_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) else() cc_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) endif() @@ -249,7 +290,7 @@ function(kernel_library TARGET) if (${target_build_flag} EQUAL 1) if (${common_srcs_len} GREATER 0 OR ${cpu_srcs_len} GREATER 0 OR - ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR + ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0 OR ${selected_rows_srcs_len} GREATER 0) # append target into PHI_KERNELS property get_property(phi_kernels GLOBAL PROPERTY PHI_KERNELS) @@ -275,6 +316,9 @@ function(kernel_library TARGET) if (${gpudnn_srcs_len} GREATER 0) kernel_declare(${gpudnn_srcs}) endif() + if (${kps_srcs_len} GREATER 0) + kernel_declare(${kps_srcs}) + endif() if (${selected_rows_srcs_len} GREATER 0) kernel_declare(${selected_rows_srcs}) endif() diff --git a/paddle/fluid/distributed/collective/CMakeLists.txt b/paddle/fluid/distributed/collective/CMakeLists.txt index 
41652f8b6ed6f717ad8a571be8e7a16408b34504..a5b40f8aa07d77e803f2cad36155b7de1bd03719 100644 --- a/paddle/fluid/distributed/collective/CMakeLists.txt +++ b/paddle/fluid/distributed/collective/CMakeLists.txt @@ -1,4 +1,5 @@ cc_library(processgroup SRCS ProcessGroup.cc DEPS phi phi_api eager_api) +cc_library(eager_reducer SRCS reducer.cc DEPS eager_api processgroup) if(WITH_NCCL) cc_library(processgroup_nccl SRCS ProcessGroupNCCL.cc DEPS place cuda_stream enforce collective_helper device_context phi phi_api eager_api) diff --git a/paddle/fluid/distributed/collective/reducer.cc b/paddle/fluid/distributed/collective/reducer.cc new file mode 100644 index 0000000000000000000000000000000000000000..59f3ea3b0a7d85651e7780b4b11875f19b70931e --- /dev/null +++ b/paddle/fluid/distributed/collective/reducer.cc @@ -0,0 +1,131 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/distributed/collective/reducer.h" +#include "paddle/phi/common/data_type.h" + +namespace paddle { +namespace distributed { + +std::vector> Eager_AssignGroupBySize( + const std::vector tensors, + const std::vector &is_sparse_gradient, + const std::vector &group_size_limits, + const std::vector &tensor_indices) { + PADDLE_ENFORCE_EQ( + tensors.size(), is_sparse_gradient.size(), + platform::errors::PreconditionNotMet( + "tensors len must be equal to is_sparse_gradient len, but " + "[%lu] != [%lu]", + tensors.size(), is_sparse_gradient.size())); + auto check_perm = [](const std::vector &x) -> bool { + size_t len = x.size(); + std::vector cnt(len, 0); + for (size_t i = 0; i < len; ++i) { + if (x[i] >= static_cast(len) || x[i] < 0 || cnt[x[i]]) { + return false; + } + cnt[x[i]]++; + } + return true; + }; + + PADDLE_ENFORCE_EQ(true, check_perm(tensor_indices), + platform::errors::PreconditionNotMet( + "tensor_indices must be a permutation from 0 to %lu", + tensor_indices.size())); + // the return vector + std::vector> res; + + // Key: the var type + // Value: should use which index in group_size_limits for group size limit + std::map group_limit_index; + + // Key: the var type + // Value: + std::map, size_t>> + next_group; + + for (size_t i = 0; i < tensors.size(); ++i) { + const auto &var = tensors[i]; + + size_t tensor_real_index = i; + if (!tensor_indices.empty()) { + tensor_real_index = tensor_indices[i]; + } + + if (is_sparse_gradient[tensor_real_index]) { + // we keep sparse var a single group + res.push_back({tensor_real_index}); + continue; + } + + const auto &var_dtype = var.dtype(); + VLOG(3) << "var[" << var.name() << "] 's type is " << var_dtype; + auto &group_info = next_group[var_dtype]; + + int64_t var_size = -1; + + if (var.is_dense_tensor()) { + var_size = + std::dynamic_pointer_cast(var.impl())->numel(); + } else { + VLOG(3) << "var " << var.name() + << " is not tensor or selected_rows, so skip it"; + continue; + } + + group_info.first.push_back(tensor_real_index); + group_info.second += experimental::SizeOf(var_dtype) * 
var_size; + // group_info.second += framework::SizeOfType(var_dtype) * var_size; + + if (group_limit_index.find(var_dtype) == group_limit_index.end()) { + // means it is the first var of var_dtype + group_limit_index[var_dtype] = 0; + } + auto &cur_limit_index = group_limit_index[var_dtype]; + if (group_info.second >= group_size_limits[cur_limit_index]) { + // exceed group capacity and create a new group + res.emplace_back(std::move(group_info.first)); + group_info = std::pair, size_t>(); + cur_limit_index = + (std::min)(cur_limit_index + 1, group_size_limits.size() - 1); + } + } + + // add the final groups + for (auto &e : next_group) { + auto &group_info = e.second; + if (!group_info.first.empty()) { + res.emplace_back(std::move(group_info.first)); + } + } + + for (const auto &group_index : res) { + PADDLE_ENFORCE_NE( + group_index.empty(), true, + platform::errors::PreconditionNotMet( + "AssignGroupBySize construct empty group, please check.")); + } + if (tensor_indices.empty()) { + std::sort(res.begin(), res.end(), + [](const std::vector &x, const std::vector &y) { + return x.front() < y.front(); + }); + } + return res; +} + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/collective/reducer.h b/paddle/fluid/distributed/collective/reducer.h new file mode 100644 index 0000000000000000000000000000000000000000..f8c75385ef8bd6891df8eda6faa93c73091c37f5 --- /dev/null +++ b/paddle/fluid/distributed/collective/reducer.h @@ -0,0 +1,32 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
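A minimal usage sketch, not part of the patch, for the grouping helper implemented above; the helper name CollectGradTensors, the tensor list, and the 25 MB bucket limit are made-up examples, and the signature is assumed to match the declaration in reducer.h below:

    #include "paddle/fluid/distributed/collective/reducer.h"

    // Bucket dense gradients into roughly 25 MB groups before fused communication.
    std::vector<paddle::experimental::Tensor> grads = CollectGradTensors();  // hypothetical helper
    std::vector<bool> is_sparse(grads.size(), false);   // all dense in this example
    std::vector<size_t> limits = {25 * 1024 * 1024};    // per-group byte limit
    auto groups =
        paddle::distributed::Eager_AssignGroupBySize(grads, is_sparse, limits);
    // groups[i] holds the indices of the tensors fused into bucket i; sparse
    // gradients each form a single-element group, and groups are sorted by
    // their first tensor index when no tensor_indices are supplied.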
+ +#pragma once + +#include +#include +#include "paddle/fluid/distributed/collective/ProcessGroup.h" +#include "paddle/fluid/eager/api/utils/tensor_utils.h" + +namespace paddle { +namespace distributed { +using Tensor = paddle::experimental::Tensor; + +std::vector> Eager_AssignGroupBySize( + const std::vector, const std::vector& is_sparse_gradient, + const std::vector& group_size_limits, + const std::vector& tensor_indices = {}); + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/framework/infershape_utils.cc b/paddle/fluid/framework/infershape_utils.cc index e14b91d935d05c12442f3d0205c1e97df9697ec3..d9287b9a624d39c40cd63071ab08257a8526ce17 100644 --- a/paddle/fluid/framework/infershape_utils.cc +++ b/paddle/fluid/framework/infershape_utils.cc @@ -308,22 +308,25 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, // TODO(chenweihang): support multiple inputs and outputs later phi::InferMetaContext infer_mete_context; for (auto& in_name : input_names) { - if (ctx->HasInput(in_name)) { - infer_meta_context.EmplaceBackInput(std::make_shared( - ctx->GetInputVarPtrs(in_name)[0], ctx->IsRuntime())); + if (ctx->HasInputs(in_name)) { + auto input_var = ctx->GetInputVarPtrs(in_name); + if (input_var.size() == 1) { + infer_meta_context.EmplaceBackInput( + std::make_shared(input_var[0], ctx->IsRuntime())); + } else { + paddle::SmallVector> inputs; + inputs.reserve(input_var.size()); + for (const auto& in : input_var) { + inputs.push_back( + std::make_shared(in, ctx->IsRuntime())); + } + infer_meta_context.EmplaceBackInputs(std::move(inputs)); + } } else { infer_meta_context.EmplaceBackInput({nullptr}); } } - for (auto& out_name : output_names) { - if (ctx->HasOutput(out_name)) { - infer_meta_context.EmplaceBackOutput(std::make_shared( - ctx->GetOutputVarPtrs(out_name)[0], ctx->IsRuntime())); - } else { - infer_meta_context.EmplaceBackOutput({nullptr}); - } - } auto attr_reader = ctx->Attrs(); for (size_t i = 0; i < attr_names.size(); ++i) { auto attr_name = attr_names[i]; @@ -348,13 +351,13 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, } } else { // If is not in runtime, we will set default value(-1) for ScalarArray - int64_t num_ele = 0; std::vector vars; vars.reserve(infershape_inputs.size()); - for (size_t i = 0; i < infershape_inputs.size(); i++) { + for (size_t i = 0; i < infershape_inputs.size(); ++i) { vars.push_back(BOOST_GET_CONST(VarDesc*, infershape_inputs[i])); } + int64_t num_ele = 0; if (vars.size() == 1) { num_ele = 1; const auto& tensor_dims = vars[0]->GetShape(); @@ -362,16 +365,7 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, num_ele *= tensor_dims[i]; } } else { - for (auto& var : vars) { - const auto& tensor_dims = var->GetShape(); - PADDLE_ENFORCE_EQ(tensor_dims.size(), 1, - platform::errors::InvalidArgument( - "The shape is constructed by multi-tensor, " - "every tensor's dims should be 1. 
But your " - "shape has tensor that dims is %s.", - tensor_dims.size())); - num_ele += tensor_dims[0]; - } + num_ele = vars.size(); } phi::ScalarArray tensor_attr(std::vector(num_ele, -1)); tensor_attr.SetFromTensor(true); @@ -383,10 +377,14 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, std::type_index(typeid(std::vector))) { infer_meta_context.EmplaceBackAttr(std::move( phi::ScalarArray(BOOST_GET_CONST(std::vector, attr)))); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(int))) { + infer_meta_context.EmplaceBackAttr( + phi::ScalarArray({BOOST_GET_CONST(int, attr)})); } else { PADDLE_THROW(platform::errors::Unimplemented( "Unsupported cast op attribute `%s` to ScalarArray when " - "construct KernelContext.", + "construct InferMetaContext.", attr_name)); } } @@ -414,7 +412,6 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, } } else if (ctx->HasInput(attr_name)) { const auto& infershape_input = ctx->GetInputVarPtrs(attr_name); - if (infershape_input.size() == 1) { if (ctx->IsRuntime()) { Variable* var = BOOST_GET_CONST(Variable*, infershape_input[0]); @@ -490,6 +487,28 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, "Unsupported attribute type is received when call " "InferShapeFunctor.")); } + } else { + // do nothing + } + } + + for (auto& out_name : output_names) { + if (ctx->HasOutputs(out_name)) { + auto output_var = ctx->GetOutputVarPtrs(out_name); + if (output_var.size() == 1) { + infer_meta_context.EmplaceBackOutput(std::make_shared( + output_var[0], ctx->IsRuntime())); + } else { + paddle::SmallVector> outputs; + outputs.reserve(output_var.size()); + for (const auto& out : output_var) { + outputs.emplace_back( + std::make_shared(out, ctx->IsRuntime())); + } + infer_meta_context.EmplaceBackOutputs(std::move(outputs)); + } + } else { + infer_meta_context.EmplaceBackOutput({nullptr}); } } diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index dad5358590cb1497453681ce940898314a1d06eb..0d53a54ff822ae4dde9fcba7c2559569c7e1bd4f 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -78,7 +78,6 @@ pass_library(is_test_pass base) pass_library(conv_elementwise_add_act_fuse_pass inference) pass_library(conv_elementwise_add2_act_fuse_pass inference) pass_library(conv_elementwise_add_fuse_pass inference) -pass_library(conv_affine_channel_fuse_pass inference) pass_library(transpose_flatten_concat_fuse_pass inference) pass_library(identity_scale_op_clean_pass base) pass_library(sync_batch_norm_pass base) diff --git a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc deleted file mode 100644 index f28c9988bd858ad00a5c5a532b7b484315557d8f..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc +++ /dev/null @@ -1,420 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h" - -#include - -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/op_version_registry.h" - -namespace phi { -class DenseTensor; -} // namespace phi - -namespace paddle { -namespace framework { -class Scope; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace framework { -namespace ir { - -class Node; - -#define GET_CONV_BN_NODES(pattern_name) \ - /* OPERATORS */ \ - GET_IR_NODE_FROM_SUBGRAPH(conv, conv, pattern_name); \ - GET_IR_NODE_FROM_SUBGRAPH(affine_channel, affine_channel, pattern_name); \ - /* CONV inputs */ \ - GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight, pattern_name); \ - /* CONV outputs */ \ - GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, pattern_name); \ - /* Affine Channel inputs */ \ - GET_IR_NODE_FROM_SUBGRAPH(ac_scale, ac_scale, pattern_name); \ - GET_IR_NODE_FROM_SUBGRAPH(ac_bias, ac_bias, pattern_name); \ - /* Affine channel outputs */ \ - GET_IR_NODE_FROM_SUBGRAPH(ac_out, ac_out, pattern_name); /* Out */ - -void recompute_bias_and_weights(const Scope* scope, ir::Node* conv_weight, - const ir::Node& ac_scale, - const LoDTensor& ac_bias_tensor, - LoDTensor* eltwise_y_in_tensor) { - using EigenVectorArrayMap = - Eigen::Map>; - using ConstEigenVectorArrayMap = - Eigen::Map>; - using EigenMatrixArrayMap = Eigen::Map< - Eigen::Array>; - - // Re-compute bias of conv2d from AffineChannel - PADDLE_ENFORCE_EQ( - eltwise_y_in_tensor->dims(), ac_bias_tensor.dims(), - platform::errors::InvalidArgument( - "Tensor elementwise y(%d) and activation bias(%d) must have same " - "dimension.", - eltwise_y_in_tensor->dims().size(), ac_bias_tensor.dims().size())); - - auto* scale_tensor = scope->FindVar(ac_scale.Name())->GetMutable(); - - ConstEigenVectorArrayMap scale_array(scale_tensor->data(), - scale_tensor->numel(), 1); - ConstEigenVectorArrayMap ac_bias_array(ac_bias_tensor.data(), - ac_bias_tensor.numel(), 1); - - EigenVectorArrayMap eltwise_y_in_array( - eltwise_y_in_tensor->mutable_data(platform::CPUPlace()), - eltwise_y_in_tensor->numel(), 1); - - eltwise_y_in_array = (eltwise_y_in_array * scale_array) + ac_bias_array; - - // Re-compute weight of conv2d from AffineChannel - auto* weights = scope->FindVar(conv_weight->Name())->GetMutable(); - auto weights_shape = weights->dims(); - auto weights_shape_2d = phi::flatten_to_2d(weights_shape, 1); - auto* weights_data = weights->mutable_data(platform::CPUPlace()); - - EigenMatrixArrayMap weights_array_2d(weights_data, weights_shape_2d[0], - weights_shape_2d[1]); - - weights_array_2d.colwise() *= scale_array; - - // Check for subnormal values that slows down convolution execution - for (int i = 0; i < weights->numel(); ++i) { - if (std::fpclassify(weights_data[i]) == FP_SUBNORMAL) weights_data[i] = 0; - } -} - -ConvAffineChannelFusePass::ConvAffineChannelFusePass() { - AddOpCompat(OpCompat("conv2d")) - .AddInput("Input") - .IsTensor() - .End() - .AddInput("Filter") - .IsTensor() - .End() - .AddInput("Bias") - .IsTensor() - .IsOptional() - .End() - .AddInput("ResidualData") - .IsTensor() - .IsOptional() - .End() - .AddOutput("Output") - .IsTensor() - .End() - .AddAttr("strides") - .IsType>() - .End() - .AddAttr("paddings") - .IsType>() - .End() - .AddAttr("padding_algorithm") - .IsOptional() - .IsStringIn({"EXPLICIT", "SAME", "VALID"}) - .End() - .AddAttr("groups") - .IsNumGE(1) - .End() - 
.AddAttr("dilations") - .IsType>() - .End() - .AddAttr("data_format") - .IsStringIn({"NCHW", "AnyLayout"}) - .End(); - - AddOpCompat(OpCompat("affine_channel")) - .AddInput("X") - .IsTensor() - .End() - .AddInput("Scale") - .IsTensor() - .End() - .AddInput("Bias") - .IsTensor() - .IsOptional() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddAttr("data_layout") - .IsStringIn({"NCHW", "AnyLayout"}) - .End(); - - AddOpCompat(OpCompat("elementwise_add")) - .AddInput("X") - .IsTensor() - .End() - .AddInput("Y") - .IsTensor() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddAttr("axis") - .IsNumEQ(1) - .End(); -} - -void ConvAffineChannelFusePass::ApplyImpl(ir::Graph* graph) const { - PADDLE_ENFORCE_NOT_NULL( - graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); - FusePassBase::Init(name_scope_, graph); - - auto* scope = param_scope(); - PADDLE_ENFORCE_NOT_NULL( - scope, platform::errors::InvalidArgument("Scope cannot be nullptr.")); - - GraphPatternDetector gpd; - auto* conv_input = - gpd.mutable_pattern() - ->NewNode(patterns::PDNodeName(name_scope_, "conv_input")) - ->AsInput() - ->assert_is_op_input("conv2d", "Input"); - patterns::ConvAffineChannel conv_ac_pattern(gpd.mutable_pattern(), - name_scope_); - conv_ac_pattern(conv_input, false /*with_eltwise_add*/); - - int found_conv_ac_count = 0; - auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, - Graph* g) { - if (!IsCompat(subgraph, g)) { - LOG(WARNING) << "ConvAffineChannelFusePass in op compat failed."; - return; - } - - VLOG(4) << "handle ConvAffineChannel fuse"; - - GET_CONV_BN_NODES(conv_ac_pattern); - - auto data_format = conv->Op()->GetAttrIfExists("data_format"); - if (data_format == "AnyLayout") { - LOG_FIRST_N(WARNING, 1) << "conv_affine_channel_fuse_pass is enabled, " - "it's wrong if data_format of conv is not " - "NCHW."; - } - - // Get affine_channel bias for resizing eltwise_y! - auto* ac_bias_tensor = - scope->FindVar(ac_bias->Name())->GetMutable(); - - // Create eltwise_y (conv bias) variable - VarDesc eltwise_y_in_desc( - patterns::PDNodeName(name_scope_, "eltwise_y_in")); - // Set shape && datatype manually - eltwise_y_in_desc.SetShape(phi::vectorize(ac_bias_tensor->dims())); - eltwise_y_in_desc.SetDataType( - framework::TransToProtoVarType(ac_bias_tensor->dtype())); - eltwise_y_in_desc.SetLoDLevel(ac_bias->Var()->GetLoDLevel()); - eltwise_y_in_desc.SetPersistable(true); - - // Initialize eltwise_y - auto* eltwise_y_in_node = g->CreateVarNode(&eltwise_y_in_desc); - auto* eltwise_y_in_tensor = - scope->Var(eltwise_y_in_node->Name())->GetMutable(); - eltwise_y_in_tensor->Resize(ac_bias_tensor->dims()); - std::fill_n(eltwise_y_in_tensor->mutable_data(platform::CPUPlace()), - eltwise_y_in_tensor->numel(), 0.0f); - - // update weights and biases - recompute_bias_and_weights(scope, conv_weight, *ac_scale, *ac_bias_tensor, - eltwise_y_in_tensor); - - // create an elementwise add node. - OpDesc desc; - desc.SetInput("X", std::vector({conv_out->Name()})); - desc.SetInput("Y", std::vector({eltwise_y_in_node->Name()})); - desc.SetOutput("Out", std::vector({ac_out->Name()})); - desc.SetType("elementwise_add"); - desc.SetAttr("axis", 1); - desc.SetAttr("use_mkldnn", conv->Op()->GetAttrIfExists("use_mkldnn")); - - auto eltwise_op = g->CreateOpNode(&desc); // OpDesc will be copied. 
- - GraphSafeRemoveNodes(graph, {ac_scale, ac_bias, affine_channel}); - - IR_NODE_LINK_TO(conv_out, eltwise_op); - IR_NODE_LINK_TO(eltwise_y_in_node, eltwise_op); - IR_NODE_LINK_TO(eltwise_op, ac_out); - found_conv_ac_count++; - }; - - gpd(graph, handler); - - AddStatis(found_conv_ac_count); -} - -ConvEltwiseAddAffineChannelFusePass::ConvEltwiseAddAffineChannelFusePass() { - AddOpCompat(OpCompat("conv2d")) - .AddInput("Input") - .IsTensor() - .End() - .AddInput("Filter") - .IsTensor() - .End() - .AddInput("Bias") - .IsTensor() - .IsOptional() - .End() - .AddInput("ResidualData") - .IsTensor() - .IsOptional() - .End() - .AddOutput("Output") - .IsTensor() - .End() - .AddAttr("strides") - .IsType>() - .End() - .AddAttr("paddings") - .IsType>() - .End() - .AddAttr("padding_algorithm") - .IsOptional() - .IsStringIn({"EXPLICIT", "SAME", "VALID"}) - .End() - .AddAttr("groups") - .IsNumGE(1) - .End() - .AddAttr("dilations") - .IsType>() - .End() - .AddAttr("data_format") - .IsStringIn({"NCHW", "AnyLayout"}) - .End(); - AddOpCompat(OpCompat("affine_channel")) - .AddInput("X") - .IsTensor() - .End() - .AddInput("Scale") - .IsTensor() - .End() - .AddInput("Bias") - .IsTensor() - .IsOptional() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddAttr("data_layout") - .IsStringIn({"NCHW", "AnyLayout"}) - .End(); - AddOpCompat(OpCompat("elementwise_add")) - .AddInput("X") - .IsTensor() - .End() - .AddInput("Y") - .IsTensor() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddAttr("axis") - .IsNumEQ(1) - .End(); -} - -void ConvEltwiseAddAffineChannelFusePass::ApplyImpl(ir::Graph* graph) const { - PADDLE_ENFORCE_NOT_NULL( - graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); - FusePassBase::Init(name_scope_, graph); - - auto* scope = param_scope(); - PADDLE_ENFORCE_NOT_NULL( - scope, platform::errors::InvalidArgument("Scope cannot be nullptr.")); - - GraphPatternDetector gpd; - auto* conv_input = - gpd.mutable_pattern() - ->NewNode(patterns::PDNodeName(name_scope_, "conv_input")) - ->AsInput() - ->assert_is_op_input("conv2d", "Input"); - patterns::ConvAffineChannel conv_ac_pattern(gpd.mutable_pattern(), - name_scope_); - conv_ac_pattern(conv_input, true /*with_eltwise_add*/); - - int found_conv_ac_count = 0; - auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, - Graph* g) { - if (!IsCompat(subgraph, g)) { - LOG(WARNING) - << "ConvEltwiseAddAffineChannelFusePass in op compat failed."; - return; - } - - VLOG(4) << "handle ConvBN fuse"; - - GET_CONV_BN_NODES(conv_ac_pattern); - auto data_format = conv->Op()->GetAttrIfExists("data_format"); - if (data_format == "AnyLayout") { - LOG_FIRST_N(WARNING, 1) << "conv_eltwiseadd_affine_channel_fuse_pass is " - "enabled, it's wrong if data_format of conv " - "is not NCHW."; - } - // OPERATORS - GET_IR_NODE_FROM_SUBGRAPH(eltwise, eltwise, conv_ac_pattern); - // BIAS inputs - GET_IR_NODE_FROM_SUBGRAPH(eltwise_y_in, eltwise_y_in, conv_ac_pattern); - // BIAS outputs - GET_IR_NODE_FROM_SUBGRAPH(eltwise_out, eltwise_out, conv_ac_pattern); - - // Get eltwise_y (conv bias) variable - auto* eltwise_y_in_tensor = - scope->FindVar(eltwise_y_in->Name())->GetMutable(); - - // Get batch norm bias - auto* ac_bias_tensor = - scope->FindVar(ac_bias->Name())->GetMutable(); - - recompute_bias_and_weights(scope, conv_weight, *ac_scale, *ac_bias_tensor, - eltwise_y_in_tensor); - - // Update the elementwise_add node - eltwise->Op()->SetAttr("axis", 1); - eltwise->Op()->SetOutput("Out", std::vector({ac_out->Name()})); - - 
GraphSafeRemoveNodes(graph, - {ac_scale, ac_bias, affine_channel, eltwise_out}); - - IR_NODE_LINK_TO(eltwise, ac_out); - - found_conv_ac_count++; - }; - - gpd(graph, handler); - AddStatis(found_conv_ac_count); -} - -} // namespace ir -} // namespace framework -} // namespace paddle - -REGISTER_PASS(conv_affine_channel_fuse_pass, - paddle::framework::ir::ConvAffineChannelFusePass); -REGISTER_PASS(conv_eltwiseadd_affine_channel_fuse_pass, - paddle::framework::ir::ConvEltwiseAddAffineChannelFusePass); -REGISTER_PASS_CAPABILITY(conv_affine_channel_fuse_pass) - .AddCombination( - paddle::framework::compatible::OpVersionComparatorCombination() - .LE("conv2d", 1) - .EQ("affine_channel", 0)); -REGISTER_PASS_CAPABILITY(conv_eltwiseadd_affine_channel_fuse_pass) - .AddCombination( - paddle::framework::compatible::OpVersionComparatorCombination() - .LE("conv2d", 1) - .LE("elementwise_add", 1) - .EQ("affine_channel", 0)); diff --git a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h deleted file mode 100644 index 8cfaf5c6a89f06b453dbbc94b5a7fe8b83e5c111..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include - -#include "paddle/fluid/framework/ir/fuse_pass_base.h" -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/graph_pattern_detector.h" - -namespace paddle { -namespace framework { -namespace ir { - -/* - * Fuse the Conv and ConvAffineChannel. 
- */ -class Graph; - -class ConvAffineChannelFusePass : public FusePassBase { - public: - ConvAffineChannelFusePass(); - virtual ~ConvAffineChannelFusePass() {} - - protected: - void ApplyImpl(ir::Graph*) const override; - const std::string name_scope_{"conv_affine_channel_fuse"}; -}; - -class ConvEltwiseAddAffineChannelFusePass : public FusePassBase { - public: - ConvEltwiseAddAffineChannelFusePass(); - virtual ~ConvEltwiseAddAffineChannelFusePass() {} - - protected: - void ApplyImpl(ir::Graph*) const override; - const std::string name_scope_{"conv_eltwiseadd_affine_channel_fuse"}; -}; - -} // namespace ir -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index d33791f70c4d2f759bcd4f6443a5a1f244673d4f..b12ad552aba6e6e599689c05c23ae306110aa78f 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -2074,6 +2074,7 @@ void OperatorWithKernel::BuildPhiKernelContext( } pt_kernel_context->AssignInputRange(std::make_pair(start_idx, end_idx), i); } + VLOG(4) << "Done inputs"; for (size_t i = 0; i < output_names.size(); ++i) { auto it = ctx.outputs.find(output_names[i]); @@ -2107,17 +2108,12 @@ void OperatorWithKernel::BuildPhiKernelContext( "Unsupported output `%s` type when call pt kernel.", framework::ToTypeName(var->Type()))); } - - experimental::ResetTensorDtypeAndLayoutByArgDef(tensor_out, - output_defs.at(i)); - SetAllocationForOutputTenosr( - tensor_out, phi::TransToPhiPlace(output_defs.at(i).backend)); - pt_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out); } pt_kernel_context->AssignOutputRange(std::make_pair(start_idx, end_idx), i); } + VLOG(4) << "Done outputs"; for (size_t i = 0; i < attr_names.size(); ++i) { if (attr_defs[i].type_index == std::type_index(typeid(phi::ScalarArray))) { @@ -2226,6 +2222,7 @@ void OperatorWithKernel::BuildPhiKernelContext( } } } + VLOG(4) << "Done attributes"; } } // namespace framework diff --git a/paddle/fluid/framework/phi_utils.cc b/paddle/fluid/framework/phi_utils.cc index 355291beb60f949b52b681592d42b7da4e80186b..93bc2c02d57cb7b57cf48d6f5c34a27a97637377 100644 --- a/paddle/fluid/framework/phi_utils.cc +++ b/paddle/fluid/framework/phi_utils.cc @@ -68,6 +68,8 @@ OpKernelType TransPhiKernelKeyToOpKernelType(const phi::KernelKey& kernel_key) { library_type = LibraryType::kMKLDNN; } else if (kernel_key.backend() == phi::Backend::GPUDNN) { library_type = LibraryType::kCUDNN; + } else if (kernel_key.backend() == phi::Backend::KPS) { + library_type = LibraryType::kKP; } else { // do nothing } @@ -82,6 +84,8 @@ phi::KernelKey TransOpKernelTypeToPhiKernelKey( backend = phi::Backend::MKLDNN; } else if (kernel_type.library_type_ == LibraryType::kCUDNN) { backend = phi::Backend::GPUDNN; + } else if (kernel_type.library_type_ == LibraryType::kKP) { + backend = phi::Backend::KPS; } else { // do } @@ -229,26 +233,5 @@ static void SetAllocationForUninitializedDenseTensor( dense_tensor->ResetHolder(shared_allocation); } -void SetAllocationForOutputTenosr(phi::TensorBase* tensor, - const platform::Place& place) { - if (phi::DenseTensor::classof(tensor)) { - auto* dense_tensor = static_cast(tensor); - if (!dense_tensor->IsInitialized() || !(dense_tensor->place() == place)) { - SetAllocationForUninitializedDenseTensor(dense_tensor, place); - } - } else if (phi::SelectedRows::classof(tensor)) { - auto* selected_rows = static_cast(tensor); - if (!selected_rows->value().IsInitialized() || - !(selected_rows->place() == place)) { - 
SetAllocationForUninitializedDenseTensor(selected_rows->mutable_value(), - place); - } - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported tensor type is received when setting allocation for " - "output tensor.")); - } -} - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/phi_utils.h b/paddle/fluid/framework/phi_utils.h index 1a1f79d82770058ae4010b7a3a3162280ceb1537..a17578816921b2337a76d1a0a69a6c8adbc51c4d 100644 --- a/paddle/fluid/framework/phi_utils.h +++ b/paddle/fluid/framework/phi_utils.h @@ -62,9 +62,6 @@ class KernelArgsNameMaker { void InitDefaultKernelSignatureMap(); -void SetAllocationForOutputTenosr(phi::TensorBase* tensor, - const platform::Place& place); - // TODO(Wilber): support others device context. template struct ConvertToPhiContext { diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h index 8e1e2fbe9a12da672a633075ed4c41d3d62cd7e1..3b5762720e7fb4a9eb0be157f6dabf07aa9353c2 100644 --- a/paddle/fluid/imperative/prepared_operator.h +++ b/paddle/fluid/imperative/prepared_operator.h @@ -323,12 +323,6 @@ void BuildDygraphPhiKernelContext( "Unsupported output `%s` type when call pt kernel.", framework::ToTypeName(var->Type()))); } - - experimental::ResetTensorDtypeAndLayoutByArgDef(tensor_out, - output_defs.at(i)); - framework::SetAllocationForOutputTenosr( - tensor_out, phi::TransToPhiPlace(output_defs.at(i).backend)); - kernel_ctx->EmplaceBackOutputWithoutSetRange(tensor_out); } kernel_ctx->AssignOutputRange(std::make_pair(start_idx, end_idx), i); diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 313e1f2faea553809cb6fce66ca9a751bace8d75..f5f36d805b43ea0815683e3b65bf157fe5beb2de 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -75,13 +75,11 @@ void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) { void PaddlePassBuilder::ClearPasses() { passes_.clear(); } const std::vector kTRTSubgraphPasses({ - "conv_affine_channel_fuse_pass", // - "adaptive_pool2d_convert_global_pass", - "conv_eltwiseadd_affine_channel_fuse_pass", // - "shuffle_channel_detect_pass", // - "quant_conv2d_dequant_fuse_pass", // - "delete_quant_dequant_op_pass", // - "delete_quant_dequant_filter_op_pass", // + "adaptive_pool2d_convert_global_pass", + "shuffle_channel_detect_pass", // + "quant_conv2d_dequant_fuse_pass", // + "delete_quant_dequant_op_pass", // + "delete_quant_dequant_filter_op_pass", // // "fc_fuse_pass", // "simplify_with_basic_ops_pass", // "embedding_eltwise_layernorm_fuse_pass", // @@ -134,22 +132,20 @@ const std::vector kLiteSubgraphPasses({ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) { passes_.assign({ // "identity_scale_op_clean_pass", // - "is_test_pass", // - "simplify_with_basic_ops_pass", // - "conv_affine_channel_fuse_pass", // - "conv_eltwiseadd_affine_channel_fuse_pass", // - "conv_bn_fuse_pass", // - "conv_eltwiseadd_bn_fuse_pass", // - "embedding_eltwise_layernorm_fuse_pass", // - "multihead_matmul_fuse_pass_v2", // - "gpu_cpu_squeeze2_matmul_fuse_pass", // - "gpu_cpu_reshape2_matmul_fuse_pass", // - "gpu_cpu_flatten2_matmul_fuse_pass", // - "gpu_cpu_map_matmul_v2_to_mul_pass", // - "gpu_cpu_map_matmul_v2_to_matmul_pass", // - "gpu_cpu_map_matmul_to_mul_pass", // - "fc_fuse_pass", // - "fc_elementwise_layernorm_fuse_pass", // + "is_test_pass", // + "simplify_with_basic_ops_pass", // + "conv_bn_fuse_pass", // + 
"conv_eltwiseadd_bn_fuse_pass", // + "embedding_eltwise_layernorm_fuse_pass", // + "multihead_matmul_fuse_pass_v2", // + "gpu_cpu_squeeze2_matmul_fuse_pass", // + "gpu_cpu_reshape2_matmul_fuse_pass", // + "gpu_cpu_flatten2_matmul_fuse_pass", // + "gpu_cpu_map_matmul_v2_to_mul_pass", // + "gpu_cpu_map_matmul_v2_to_matmul_pass", // + "gpu_cpu_map_matmul_to_mul_pass", // + "fc_fuse_pass", // + "fc_elementwise_layernorm_fuse_pass", // #if CUDNN_VERSION >= 7100 // To run conv_fusion, the version of cudnn must be // guaranteed at least v7 // cudnn8.0 has memory leak problem in conv + eltwise + act, so we @@ -236,14 +232,12 @@ void CpuPassStrategy::EnableMKLDNN() { passes_.insert(passes_.begin(), "mkldnn_placement_pass"); for (auto &pass : std::vector({ - "depthwise_conv_mkldnn_pass", // - "conv_bn_fuse_pass", // Execute BN passes again to - "conv_eltwiseadd_bn_fuse_pass", // preserve correct pass order - "conv_affine_channel_fuse_pass", // - "conv_eltwiseadd_affine_channel_fuse_pass", // - "conv_transpose_bn_fuse_pass", // - "conv_transpose_eltwiseadd_bn_fuse_pass", // - "conv_bias_mkldnn_fuse_pass", // + "depthwise_conv_mkldnn_pass", // + "conv_bn_fuse_pass", // Execute BN passes again to + "conv_eltwiseadd_bn_fuse_pass", // preserve correct pass order + "conv_transpose_bn_fuse_pass", // + "conv_transpose_eltwiseadd_bn_fuse_pass", // + "conv_bias_mkldnn_fuse_pass", // "conv_transpose_bias_mkldnn_fuse_pass", // TODO(baoachun): Need to support 5-dimensional input. // "conv3d_bias_mkldnn_fuse_pass", // diff --git a/paddle/fluid/operators/cast_op.cu b/paddle/fluid/operators/cast_op.cu index 5c7dd0e2561fa41313b2e65a443a9e4913a39961..eb51215790bbcdbc9e7d0c3adad482d9a69324b9 100644 --- a/paddle/fluid/operators/cast_op.cu +++ b/paddle/fluid/operators/cast_op.cu @@ -29,9 +29,5 @@ using CUDA = paddle::platform::CUDADeviceContext; ops::CastOpKernel>, \ ops::CastOpKernel>, ##__VA_ARGS__); -#if !defined(PADDLE_WITH_HIP) // See [ why register transfer_dtype_op alias with cast_op? ] in cast_op.cc REGISTER_CAST_CUDA_BASE(transfer_dtype, ops::CastOpKernel) -#else -REGISTER_CAST_CUDA_BASE(transfer_dtype) -#endif diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc index 55de4087f579460fa6080733f3e2f02bb082b015..1da7798ea2696516759ac49b8ce459459e74066b 100644 --- a/paddle/fluid/operators/concat_op.cc +++ b/paddle/fluid/operators/concat_op.cc @@ -18,7 +18,9 @@ limitations under the License. */ #include #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/infermeta/multiary.h" #include "paddle/phi/kernels/funcs/concat_funcs.h" #ifdef PADDLE_WITH_MKLDNN @@ -33,41 +35,6 @@ class ConcatOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", "Concat"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Concat"); - - auto inputs_dims = ctx->GetInputsDim("X"); - - const size_t inputs_num = inputs_dims.size(); - PADDLE_ENFORCE_GT( - inputs_num, static_cast(0), - platform::errors::InvalidArgument( - "The number of input tensors in concat op should > 0. 
But " - "received inputs' length is 0.")); - if (inputs_num == 1) { - VLOG(3) << "Warning: concat op have only one input, may waste memory"; - } - - if (ctx->HasInput("AxisTensor")) { - auto out_dims = - phi::make_ddim(std::vector(inputs_dims[0].size(), -1)); - ctx->SetOutputDim("Out", out_dims); - ctx->ShareLoD("X", /*->*/ "Out"); - } else { - size_t axis = - ComputeAxis(static_cast(ctx->Attrs().Get("axis")), - static_cast(inputs_dims[0].size())); - framework::DDim out_dims = - phi::funcs::ComputeAndCheckShape(ctx->IsRuntime(), inputs_dims, axis); - if (out_dims[axis] < 0) { - out_dims[axis] = -1; - } - ctx->SetOutputDim("Out", out_dims); - ctx->ShareLoD("X", /*->*/ "Out"); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -237,9 +204,14 @@ class ConcatDoubleGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; + +DELCARE_INFER_SHAPE_FUNCTOR(concat, ConcatInferShapeFunctor, + PT_INFER_META(phi::ConcatInferMeta)); + REGISTER_OPERATOR(concat, ops::ConcatOp, ops::ConcatOpMaker, ops::ConcatGradOpMaker, - ops::ConcatGradOpMaker); + ops::ConcatGradOpMaker, + ConcatInferShapeFunctor); REGISTER_OPERATOR(concat_grad, ops::ConcatOpGrad, ops::ConcatDoubleGradOpMaker, ops::ConcatDoubleGradOpMaker, diff --git a/paddle/fluid/operators/controlflow/CMakeLists.txt b/paddle/fluid/operators/controlflow/CMakeLists.txt index 1a2df2a0c7ba34f67ecb7c2ade002fcb4475229f..a974f2ec335487e0fbc12a578c0d80d6856e418e 100644 --- a/paddle/fluid/operators/controlflow/CMakeLists.txt +++ b/paddle/fluid/operators/controlflow/CMakeLists.txt @@ -20,5 +20,5 @@ else() endif() file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(equal_all);\nUSE_NO_KERNEL_OP(read_from_array);\n") -file(APPEND ${pybind_file} "USE_OP(logical_and);\nUSE_OP(logical_or);\nUSE_OP(logical_xor);\nUSE_OP(logical_not);\n") +file(APPEND ${pybind_file} "USE_OP_ITSELF(logical_and);\nUSE_OP_ITSELF(logical_or);\nUSE_OP_ITSELF(logical_xor);\nUSE_OP_ITSELF(logical_not);\n") file(APPEND ${pybind_file} "USE_OP(bitwise_and);\nUSE_OP(bitwise_or);\nUSE_OP(bitwise_xor);\nUSE_OP(bitwise_not);\n") diff --git a/paddle/fluid/operators/controlflow/logical_op.cc b/paddle/fluid/operators/controlflow/logical_op.cc index a4262d405435ae31c2a5ad681ab443889ec5d393..4d11cb5ff74e69e991271d2a566dbc9344d35da2 100644 --- a/paddle/fluid/operators/controlflow/logical_op.cc +++ b/paddle/fluid/operators/controlflow/logical_op.cc @@ -9,11 +9,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/controlflow/logical_op.h" #include #include #include #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" namespace paddle { namespace operators { @@ -145,15 +145,7 @@ class BinaryLogicalOp : public LogicalOp { ::paddle::framework::EmptyGradOpMaker); REGISTER_BINARY_LOGICAL_OP(logical_and, "$$Out = X \\&\\& Y$$"); -REGISTER_BINARY_LOGICAL_KERNEL(logical_and, CPU, - paddle::operators::LogicalAndFunctor); REGISTER_BINARY_LOGICAL_OP(logical_or, "$$Out = X || Y$$"); -REGISTER_BINARY_LOGICAL_KERNEL(logical_or, CPU, - paddle::operators::LogicalOrFunctor); REGISTER_UNARY_LOGICAL_OP(logical_not, "$$Out = !X$$"); -REGISTER_UNARY_LOGICAL_KERNEL(logical_not, CPU, - paddle::operators::LogicalNotFunctor); REGISTER_BINARY_LOGICAL_OP(logical_xor, "$$Out = (X || Y) \\&\\& !(X \\&\\& Y)$$"); -REGISTER_BINARY_LOGICAL_KERNEL(logical_xor, CPU, - paddle::operators::LogicalXorFunctor); diff --git a/paddle/fluid/operators/controlflow/logical_op.cu b/paddle/fluid/operators/controlflow/logical_op.cu deleted file mode 100644 index d88658607ed275808d64dddf4a60d52d4f995e73..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/controlflow/logical_op.cu +++ /dev/null @@ -1,69 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/controlflow/logical_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" - -namespace paddle { -namespace operators { - -template -class BinaryLogicalOpKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using InT = typename Functor::ELEMENT_TYPE; - using OutT = bool; - - auto functor = Functor(); - std::vector ins; - std::vector outs; - const auto& cuda_ctx = - ctx.template device_context(); - int axis = PackTensorsIntoVector(ctx, &ins, &outs); - - if (ins.size() == 1) { - paddle::operators::LaunchElementwiseCudaKernel( - cuda_ctx, ins, &outs, axis, functor); - } else { - paddle::operators::LaunchElementwiseCudaKernel( - cuda_ctx, ins, &outs, axis, functor); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -#define REGISTER_LOGICAL_CUDA_KERNEL(op_name, func) \ - REGISTER_OP_CUDA_KERNEL( \ - op_name, \ - ops::BinaryLogicalOpKernel>, \ - ops::BinaryLogicalOpKernel>, \ - ops::BinaryLogicalOpKernel>, \ - ops::BinaryLogicalOpKernel>, \ - ops::BinaryLogicalOpKernel>, \ - ops::BinaryLogicalOpKernel>, \ - ops::BinaryLogicalOpKernel>); - -REGISTER_LOGICAL_CUDA_KERNEL(logical_or, LogicalOrFunctor) -REGISTER_LOGICAL_CUDA_KERNEL(logical_and, LogicalAndFunctor) -REGISTER_LOGICAL_CUDA_KERNEL(logical_xor, LogicalXorFunctor) -REGISTER_LOGICAL_CUDA_KERNEL(logical_not, LogicalNotFunctor) -#undef REGISTER_LOGICAL_CUDA_KERNEL diff --git a/paddle/fluid/operators/controlflow/logical_op.h b/paddle/fluid/operators/controlflow/logical_op.h deleted file mode 100644 index 15cd643a858cc018e3007fa90ec479900cd243be..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/controlflow/logical_op.h +++ /dev/null @@ -1,111 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" -#include "paddle/fluid/platform/transform.h" - -namespace paddle { -namespace operators { - -#define LOGICAL_BINARY_FUNCTOR(func_name, op) \ - template \ - struct func_name { \ - using ELEMENT_TYPE = T; \ - HOSTDEVICE bool operator()(const T a, const T b) const { \ - return static_cast(a) op static_cast(b); \ - } \ - }; - -LOGICAL_BINARY_FUNCTOR(LogicalOrFunctor, ||) -LOGICAL_BINARY_FUNCTOR(LogicalAndFunctor, &&) -LOGICAL_BINARY_FUNCTOR(LogicalXorFunctor, ^) -#undef LOGICAL_BINARY_FUNCTOR - -template -struct LogicalNotFunctor { - using ELEMENT_TYPE = T; - HOSTDEVICE bool operator()(const T a) const { return !a; } -}; - -template -class BinaryLogicalOpKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - using T = typename Functor::ELEMENT_TYPE; - auto* x = context.Input("X"); - auto* y = context.Input("Y"); - auto* out = context.Output("Out"); - Functor binary_func; - ElementwiseComputeEx(context, x, y, -1, - binary_func, out); - } -}; - -template -class UnaryLogicalOpKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - using T = typename Functor::ELEMENT_TYPE; - auto* x = context.Input("X"); - auto* out = context.Output("Out"); - Functor unary_func; - platform::Transform trans; - trans(context.template device_context(), x->data(), - x->data() + x->numel(), - out->mutable_data(context.GetPlace()), unary_func); - } -}; - -} // namespace operators -} // namespace paddle - -#define REGISTER_BINARY_LOGICAL_KERNEL(op_type, dev, functor) \ - REGISTER_OP_##dev##_KERNEL( \ - op_type, ::paddle::operators::BinaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::BinaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::BinaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::BinaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::BinaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::BinaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::BinaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>); - -#define REGISTER_UNARY_LOGICAL_KERNEL(op_type, dev, functor) \ - REGISTER_OP_##dev##_KERNEL( \ - op_type, ::paddle::operators::UnaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::UnaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::UnaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::UnaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::UnaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::UnaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::UnaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>); diff --git a/paddle/fluid/operators/controlflow/logical_op_npu.cc b/paddle/fluid/operators/controlflow/logical_op_npu.cc index 02f95254035d6041ef64dd746faa924abb053165..c3d7df8d0274371a4c5a482624c75b36677778a9 100644 
--- a/paddle/fluid/operators/controlflow/logical_op_npu.cc +++ b/paddle/fluid/operators/controlflow/logical_op_npu.cc @@ -9,7 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/controlflow/logical_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/dot_op.cc b/paddle/fluid/operators/dot_op.cc index ed2b09796eeeb8ce18fdc47be58347d85e6e1a80..a86a3bb35927d53d20bef91a0bf36695a268c348 100644 --- a/paddle/fluid/operators/dot_op.cc +++ b/paddle/fluid/operators/dot_op.cc @@ -14,6 +14,10 @@ #include "paddle/fluid/operators/dot_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" + namespace paddle { namespace operators { @@ -21,51 +25,6 @@ class DotOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(true, ctx->HasInput("X"), - platform::errors::PreconditionNotMet( - "Input(X) of DotOp should not be null.")); - PADDLE_ENFORCE_EQ(true, ctx->HasInput("Y"), - platform::errors::PreconditionNotMet( - "Input(Y) of DotOp should not be null.")); - PADDLE_ENFORCE_EQ(true, ctx->HasOutput("Out"), - platform::errors::PreconditionNotMet( - "Output(Out) of DotOp should not be null.")); - - auto x_dims = ctx->GetInputDim("X"); - auto x_rank = static_cast(x_dims.size()); - PADDLE_ENFORCE_EQ(true, 1 == x_rank || 2 == x_rank, - platform::errors::PreconditionNotMet( - "ShapeError: The dimensions of input tensor X (%s) " - "should be 1 or 2", - x_dims.to_str())); - - auto y_dims = ctx->GetInputDim("Y"); - PADDLE_ENFORCE_EQ( - true, x_rank == (size_t)y_dims.size(), - platform::errors::PreconditionNotMet( - "ShapeError: The shape of input tensor Y: %s should match with " - "input tenosr X: %s", - y_dims.to_str(), x_dims.to_str())); - bool shape_match = true; - for (size_t i = 0; i < x_rank; ++i) { - if (x_dims[i] != y_dims[i]) { - shape_match = false; - break; - } - } - - PADDLE_ENFORCE_EQ(true, shape_match, - platform::errors::PreconditionNotMet( - "ShapeError: The shape of input tensor X: %s should " - "be exactly the same " - "with input tensor Y: %s", - x_dims.to_str(), y_dims.to_str())); - auto dims = vectorize(x_dims); - dims[dims.size() - 1] = 1; - ctx->SetOutputDim("Out", phi::make_ddim(dims)); - } - framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( @@ -142,9 +101,13 @@ class DotOpGradMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; +DELCARE_INFER_SHAPE_FUNCTOR(dot, DotInferShapeFunctor, + PT_INFER_META(phi::DotInferMeta)); + REGISTER_OPERATOR(dot, ops::DotOp, ops::DotOpMaker, ops::DotOpGradMaker, - ops::DotOpGradMaker); + ops::DotOpGradMaker, + DotInferShapeFunctor); REGISTER_OPERATOR(dot_grad, ops::DotGradOp); diff --git a/paddle/fluid/operators/gather_op.cc b/paddle/fluid/operators/gather_op.cc index cf4d7b1d670b8add6ff5a138851c6a23ee54169e..8a405cc6fc1baefe997fb5b6133a56d6a2fc0438 100644 --- a/paddle/fluid/operators/gather_op.cc +++ b/paddle/fluid/operators/gather_op.cc @@ -201,12 +201,14 @@ REGISTER_OPERATOR(gather_grad, ops::GatherGradOp, REGISTER_OP_CPU_KERNEL(gather, 
ops::GatherOpKernel, ops::GatherOpKernel, ops::GatherOpKernel, ops::GatherOpKernel, - ops::GatherOpKernel); + ops::GatherOpKernel, + ops::GatherOpKernel); REGISTER_OP_CPU_KERNEL(gather_grad, ops::GatherGradientOpKernel, ops::GatherGradientOpKernel, ops::GatherGradientOpKernel, ops::GatherGradientOpKernel, - ops::GatherGradientOpKernel); + ops::GatherGradientOpKernel, + ops::GatherGradientOpKernel); REGISTER_OP_VERSION(gather) .AddCheckpoint(R"ROC(upgrad gather, add a new input [Axis])ROC", paddle::framework::compatible::OpVersionDesc().NewInput( diff --git a/paddle/fluid/operators/gather_op.cu b/paddle/fluid/operators/gather_op.cu index 19568835a6e96080bb1c0af642bf9cb19c346bf9..a502a13040949a34e88a4d585327a58ffe92562c 100644 --- a/paddle/fluid/operators/gather_op.cu +++ b/paddle/fluid/operators/gather_op.cu @@ -130,9 +130,11 @@ REGISTER_OP_CUDA_KERNEL(gather, ops::GatherOpCUDAKernel, ops::GatherOpCUDAKernel, ops::GatherOpCUDAKernel, ops::GatherOpCUDAKernel, - ops::GatherOpCUDAKernel); + ops::GatherOpCUDAKernel, + ops::GatherOpCUDAKernel); REGISTER_OP_CUDA_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel, ops::GatherGradOpCUDAKernel, ops::GatherGradOpCUDAKernel, ops::GatherGradOpCUDAKernel, - ops::GatherGradOpCUDAKernel); + ops::GatherGradOpCUDAKernel, + ops::GatherGradOpCUDAKernel); diff --git a/paddle/fluid/operators/group_norm_op.cu b/paddle/fluid/operators/group_norm_op.cu index 72a90d17998d84f0d0d4e081543acae94756e635..b376334f1e93cc3be9e716d808525011edb29b94 100644 --- a/paddle/fluid/operators/group_norm_op.cu +++ b/paddle/fluid/operators/group_norm_op.cu @@ -29,6 +29,7 @@ namespace operators { using DataLayout = framework::DataLayout; enum GroupNormKernelFlags { kHasScale = 1, kHasBias = 2 }; +#define ALIGN_BYTES 16 #define CHECK_CASE(i, flags, kernel_name, ...) 
\ if (i == flags) { \ @@ -56,8 +57,7 @@ __device__ __inline__ void CudaAtomicAddWithWarp(T* sum, T value) { template __global__ void GroupNormForwardGetMeanAndVar(const T* x, int N, int C, int W, int imsize, int groups, - int group_size, T* mean, T* var, - const DataLayout data_layout) { + int group_size, T* mean, T* var) { int gid = blockIdx.y; int cid = blockIdx.x; int bid = blockIdx.z; @@ -68,13 +68,10 @@ __global__ void GroupNormForwardGetMeanAndVar(const T* x, int N, int C, int W, T x_mean = 0, x_var = 0; for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) { T val; - if (data_layout == DataLayout::kNCHW) { - val = x[(bid * C + ccid) * imsize + imid]; - } else { - int hid = imid / W; - int wid = imid % W; - val = x[(bid * H + hid) * W * C + wid * C + ccid]; - } + int hid = imid / W; + int wid = imid % W; + val = x[(bid * H + hid) * W * C + wid * C + ccid]; + x_mean += val; x_var += val * val; } @@ -84,6 +81,85 @@ __global__ void GroupNormForwardGetMeanAndVar(const T* x, int N, int C, int W, CudaAtomicAddWithWarp(&var[bid * groups + gid], x_var); } +template +__device__ __forceinline__ void ThreadReduce(const T* input, int size, + const int offset, AccT* mean, + AccT* var) { + using VecT = kps::details::VectorType; + int tid = threadIdx.x; + if (offset > 0) { + input -= offset; + size += offset; + if (tid >= offset) { + AccT temp = input[tid]; + *mean += temp; + *var += temp * temp; + } + size -= blockDim.x; + input += blockDim.x; + } + int remain = size % (VecSize * blockDim.x); + + T ins[VecSize]; + VecT* ins_vec = reinterpret_cast(&ins); + + // vector part + for (; VecSize * tid < (size - remain); tid += blockDim.x) { + *ins_vec = reinterpret_cast(input)[tid]; + +#pragma unroll + for (int i = 0; i < VecSize; ++i) { + AccT temp = ins[i]; + *mean += temp; + *var += temp * temp; + } + } + + // scalar part + tid = size - remain + threadIdx.x; + for (; tid < size; tid += blockDim.x) { + AccT temp = input[tid]; + *mean += temp; + *var += temp * temp; + } +} + +template +__global__ void ScalarGetMeanAndVarNCHW(const T* x, T* mean, T* var, int size) { + int i = blockIdx.x; + T x_mean = 0, x_var = 0; + for (int j = threadIdx.x; j < size; j += blockDim.x) { + T val; + val = x[i * size + j]; + x_mean += val; + x_var += val * val; + } + x_mean /= size; + x_var /= size; + CudaAtomicAddWithWarp(&mean[i], x_mean); + CudaAtomicAddWithWarp(&var[i], x_var); +} + +template +__global__ void VectorizedGetMeanAndVarNCHW(const T* x, T* mean, T* var, + int size) { + int i = blockIdx.x; + AccT x_mean = static_cast(0); + AccT x_var = static_cast(0); + const int input_offset = ((uint64_t)x) % ALIGN_BYTES / sizeof(T); + x += i * size; + ThreadReduce(x, size, input_offset, &x_mean, &x_var); + x_mean = kps::details::BlockXReduce>( + x_mean, kps::AddFunctor()); + x_var = kps::details::BlockXReduce>( + x_var, kps::AddFunctor()); + __syncthreads(); + if (threadIdx.x == 0) { + mean[i] = static_cast(x_mean / size); + var[i] = static_cast(x_var / size); + } +} + template __global__ void GroupNormForward(const T* x, const T* mean, const T* var, const T* scale, const T* bias, int N, int C, @@ -96,26 +172,34 @@ __global__ void GroupNormForward(const T* x, const T* mean, const T* var, int H = imsize / W; int ccid = gid * group_size + cid; if (ccid >= C) return; - T x_mean = mean[bid * groups + gid]; - T x_var = var[bid * groups + gid]; + auto ng = bid * groups + gid; + T x_mean = mean[ng]; + T x_var = var[ng]; x_var = x_var - x_mean * x_mean; - T var_inv = 1.0 / sqrt(x_var + epsilon); - if (cid == 0 && 
threadIdx.x == 0) real_var[bid * groups + gid] = x_var; + T var_inv = rsqrt(x_var + epsilon); + if (cid == 0 && threadIdx.x == 0) { + real_var[ng] = x_var; + } for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) { T val; int hid, wid; + int index = (bid * C + ccid) * imsize + imid; if (data_layout == DataLayout::kNCHW) { - val = x[(bid * C + ccid) * imsize + imid]; + val = x[index]; } else { hid = imid / W; wid = imid % W; val = x[(bid * H + hid) * W * C + wid * C + ccid]; } val = (val - x_mean) * var_inv; - if (flags & kHasScale) val *= scale[gid * group_size + cid]; - if (flags & kHasBias) val += bias[gid * group_size + cid]; + if (flags & kHasScale) { + val *= scale[ccid]; + } + if (flags & kHasBias) { + val += bias[ccid]; + } if (data_layout == DataLayout::kNCHW) { - y[(bid * C + ccid) * imsize + imid] = val; + y[index] = val; } else { y[(bid * H + hid) * W * C + wid * C + ccid] = val; } @@ -182,16 +266,41 @@ class GroupNormKernel imsize *= x_dims[i]; } } + #ifdef __HIPCC__ int block_size = std::max(std::min(256, imsize), 64); #else int block_size = std::min(1024, imsize); #endif + dim3 grid(group_size, groups, x_dims[0]); dim3 threads(block_size, 1, 1); - GroupNormForwardGetMeanAndVar<<>>( - x_data, x_dims[0], C, W, imsize, groups, group_size, mean_data, - temp_var_data, data_layout); + if (data_layout == DataLayout::kNCHW) { + using AccT = typename details::MPTypeTrait::Type; + constexpr int vec_size = sizeof(float4) / sizeof(T); + int size = group_size * imsize; + const int max_num_threads = 1024; + int max_block_size = std::min(size / vec_size, max_num_threads); + int block_size_nchw = 1; + while (block_size_nchw < max_block_size) { + block_size_nchw *= 2; + } + block_size_nchw = std::max(block_size_nchw, kps::details::kWarpSize); + dim3 grids(x_dims[0] * groups); + dim3 blocks(block_size_nchw); + if (size < vec_size) { + ScalarGetMeanAndVarNCHW<<>>( + x_data, mean_data, temp_var_data, size); + } else { + VectorizedGetMeanAndVarNCHW< + T, AccT, vec_size><<>>( + x_data, mean_data, temp_var_data, size); + } + } else { + GroupNormForwardGetMeanAndVar<<>>( + x_data, x_dims[0], C, W, imsize, groups, group_size, mean_data, + temp_var_data); + } int flags = (scale_data != nullptr) * kHasScale + (bias_data != nullptr) * kHasBias; UNROLL_ALL_CASES(flags, GroupNormForward, x_data, mean_data, temp_var_data, diff --git a/paddle/fluid/operators/index_sample_op.cc b/paddle/fluid/operators/index_sample_op.cc index 2d97797cfec21ed50f0999fa13f8bb1ae9618b71..68d002fceea70fd032d7613802d095770d3d4754 100644 --- a/paddle/fluid/operators/index_sample_op.cc +++ b/paddle/fluid/operators/index_sample_op.cc @@ -12,12 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
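
For clarity, the NCHW fast path added above (ThreadReduce plus VectorizedGetMeanAndVarNCHW) accumulates, per (batch, group), the mean of x and the mean of x*x; GroupNormForward later recovers the variance as E[x^2] - E[x]^2. A scalar CPU reference of that accumulation, with illustrative names, is:

// Reference of what the vectorized NCHW kernel accumulates per (batch, group):
// mean = E[x], and the "var" buffer initially holds E[x^2].
#include <cstddef>

template <typename T, typename AccT>
void GetMeanAndSquareMeanNCHWRef(const T* x, std::size_t size,
                                 AccT* mean, AccT* sq_mean) {
  AccT m = 0, s = 0;
  for (std::size_t i = 0; i < size; ++i) {
    AccT v = static_cast<AccT>(x[i]);
    m += v;
    s += v * v;
  }
  *mean = m / static_cast<AccT>(size);
  *sq_mean = s / static_cast<AccT>(size);
}

The CUDA version only changes how this sum is produced: each thread reads VecSize elements at a time (after handling the misaligned prefix), and a block reduction combines the per-thread partial sums.
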
*/ -#include "paddle/fluid/operators/index_sample_op.h" #include #include "paddle/fluid/framework/no_need_buffer_vars_inference.h" -#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { class IndexSampleOpMaker : public framework::OpProtoAndCheckerMaker { @@ -42,44 +44,6 @@ class IndexSampleOpMaker : public framework::OpProtoAndCheckerMaker { class IndexSampleOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Inputs(Input) of FindByIndex should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Index"), true, - platform::errors::InvalidArgument( - "Inputs(Index) of FindByIndex should not be null.")); - - auto input_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_EQ( - input_dims.size(), 2, - platform::errors::InvalidArgument( - "Inputs(X) shape of IndexSample op should be 2-D, but " - "got X's shape = [%s], please check X shape.", - input_dims)); - - auto index_dims = ctx->GetInputDim("Index"); - PADDLE_ENFORCE_EQ( - input_dims.size(), 2, - platform::errors::InvalidArgument( - "Inputs(Index) shape of IndexSample op should be 2-D, but " - "got Index's shape [%s] , please check index shape.", - input_dims)); - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ(input_dims[0], index_dims[0], - platform::errors::InvalidArgument( - "Inputs(X)'s value of dimension 0 must same with " - "Inputs(Index)'s value of dimension 0, but " - "got %d of Inputs(X), and got %d of Inputs(Index), " - "please check Inputs shape.", - input_dims[0], index_dims[0])); - } - ctx->SetOutputDim("Out", index_dims); - auto type = ctx->GetInputsVarType("Index")[0]; - if (type == framework::proto::VarType::LOD_TENSOR) { - ctx->ShareLoD("Index", /*->*/ "Out"); - } - } protected: framework::OpKernelType GetExpectedKernelType( @@ -136,20 +100,11 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(IndexSampleGradNoNeedBufferVarInferer, "X"); } // namespace paddle namespace ops = paddle::operators; +DELCARE_INFER_SHAPE_FUNCTOR(index_sample, IndexSampleInferShapeFunctor, + PT_INFER_META(phi::IndexSampleInferMeta)); REGISTER_OPERATOR(index_sample, ops::IndexSampleOp, ops::IndexSampleOpMaker, ops::IndexSampleGradMaker, - ops::IndexSampleGradMaker); + ops::IndexSampleGradMaker, + IndexSampleInferShapeFunctor); REGISTER_OPERATOR(index_sample_grad, ops::IndexSampleGradOp, ops::IndexSampleGradNoNeedBufferVarInferer); -REGISTER_OP_CPU_KERNEL( - index_sample, - ops::IndexSampleKernel, - ops::IndexSampleKernel, - ops::IndexSampleKernel, - ops::IndexSampleKernel); -REGISTER_OP_CPU_KERNEL( - index_sample_grad, - ops::IndexSampleGradKernel, - ops::IndexSampleGradKernel, - ops::IndexSampleGradKernel, - ops::IndexSampleGradKernel); diff --git a/paddle/fluid/operators/index_sample_op.cu b/paddle/fluid/operators/index_sample_op.cu deleted file mode 100644 index e8acbfb8be990a422e5a16e8871d47f55af6620c..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/index_sample_op.cu +++ /dev/null @@ -1,215 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/index_sample_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -#define PREDEFINED_BLOCK_SIZE_X 512 -#define PREDEFINED_BLOCK_SIZE 1024 -#define MIN(a, b) ((a) < (b) ? (a) : (b)) - -namespace paddle { -namespace operators { - -namespace { -void LimitGridDim(const framework::ExecutionContext& ctx, dim3* grid_dim) { - auto max_grid_dim = ctx.template device_context() - .GetCUDAMaxGridDimSize(); - grid_dim->x = grid_dim->x < max_grid_dim[0] ? grid_dim->x : max_grid_dim[0]; - grid_dim->y = grid_dim->y < max_grid_dim[1] ? grid_dim->y : max_grid_dim[1]; -} -} - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; - -template -__global__ void IndexSampleForward(const IndexT* index, const T* in_data, - T* out_data, size_t index_length, - size_t input_length, size_t batch_size) { - unsigned int index_i = blockDim.x * blockIdx.x + threadIdx.x; - unsigned int index_j = blockDim.y * blockIdx.y + threadIdx.y; - for (; index_j < batch_size; index_j += blockDim.y * gridDim.y) { - index_i = blockDim.x * blockIdx.x + threadIdx.x; - for (; index_i < index_length; index_i += blockDim.x * gridDim.x) { - unsigned int index_idx = index_j * index_length + index_i; - unsigned int in_idx = index_j * input_length + index_i; - IndexT sample_idx = index[index_idx]; - out_data[index_idx] = in_data[in_idx - index_i + sample_idx]; - } - } -} - -template -__global__ void IndexSampleGrad(const IndexT* index, T* in_grad, - const T* out_grad, size_t index_length, - size_t input_length, size_t batch_size, - bool same_data_in_row = true) { - unsigned int index_i = blockDim.x * blockIdx.x + threadIdx.x; - unsigned int index_j = blockDim.y * blockIdx.y + threadIdx.y; - - for (; index_j < batch_size; index_j += blockDim.y * gridDim.y) { - index_i = blockDim.x * blockIdx.x + threadIdx.x; - for (; index_i < index_length; index_i += blockDim.x * gridDim.x) { - unsigned int index_idx = index_j * index_length + index_i; - unsigned int in_idx = index_j * input_length + index_i; - IndexT sample_idx = index[index_idx]; - if (same_data_in_row) { - platform::CudaAtomicAdd(&(in_grad[in_idx - index_i + sample_idx]), - out_grad[sample_idx]); - } else { - in_grad[in_idx - index_i + sample_idx] = out_grad[index_idx]; - } - } - } -} - -template -class IndexSampleKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* index = ctx.Input("Index"); - auto* output = ctx.Output("Out"); - - const auto& index_type = framework::TransToProtoVarType(index->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT64 || - index_type == framework::proto::VarType::INT32; - PADDLE_ENFORCE_EQ(index_type_match, true, - 
platform::errors::InvalidArgument( - "Input(Index) holds the wrong type, it holds %s, but " - "desires to be %s or %s", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - const auto* in_data = input->data(); - auto* out_data = output->mutable_data(ctx.GetPlace()); - auto stream = - ctx.template device_context().stream(); - - auto input_dim = input->dims(); - auto index_dim = index->dims(); - size_t batch_size = input_dim[0]; - size_t input_length = input_dim[1]; - size_t index_length = index_dim[1]; - - auto block_width = platform::RoundToPowerOfTwo(index_length); - block_width = MIN(block_width, PREDEFINED_BLOCK_SIZE_X); - int block_height = - platform::RoundToPowerOfTwo(index_length * batch_size) / block_width; - block_height = MIN(block_height, PREDEFINED_BLOCK_SIZE / block_width); - dim3 block_dim(block_width, block_height); - dim3 grid_dim((index_length + block_dim.x - 1) / block_dim.x, - (batch_size + block_dim.y - 1) / block_dim.y); - LimitGridDim(ctx, &grid_dim); - - if (index_type == framework::proto::VarType::INT64) { - const int64_t* index_data = index->data(); - IndexSampleForward<<>>( - index_data, in_data, out_data, index_length, input_length, - batch_size); - } else if (index_type == framework::proto::VarType::INT32) { - const int* index_data = index->data(); - IndexSampleForward<<>>( - index_data, in_data, out_data, index_length, input_length, - batch_size); - } - } -}; - -template -class IndexSampleGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* output_grad = ctx.Input(framework::GradVarName("Out")); - auto* input_grad = ctx.Output(framework::GradVarName("X")); - auto* index = ctx.Input("Index"); - - const auto* output_grad_data = output_grad->data(); - auto* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); - - const auto& index_type = framework::TransToProtoVarType(index->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT64 || - index_type == framework::proto::VarType::INT32; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Input(Index) holds the wrong type, it holds %s, but " - "desires to be %s or %s", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - - auto stream = - ctx.template device_context().stream(); - auto input_num = input_grad->numel(); - auto input_dim = input_grad->dims(); - auto index_dim = index->dims(); - size_t batch_size = index_dim[0]; - size_t input_length = input_dim[1]; - size_t index_length = index_dim[1]; - bool same_data_in_index_row = index_length == 1 ? 
false : true; - - auto block_width = platform::RoundToPowerOfTwo(index_length); - block_width = MIN(block_width, PREDEFINED_BLOCK_SIZE_X); - auto block_height = - platform::RoundToPowerOfTwo(index_length * batch_size) / block_width; - block_height = MIN(block_height, PREDEFINED_BLOCK_SIZE / block_width); - dim3 block_dim(block_width, block_height); - dim3 grid_dim((index_length + block_dim.x - 1) / block_dim.x, - (batch_size + block_dim.y - 1) / block_dim.y); - LimitGridDim(ctx, &grid_dim); - - phi::funcs::SetConstant set_zero; - auto& dev_ctx = ctx.template device_context(); - set_zero(dev_ctx, input_grad, static_cast(0)); - - if (index_type == framework::proto::VarType::INT64) { - const int64_t* index_data = index->data(); - IndexSampleGrad<<>>( - index_data, input_grad_data, output_grad_data, index_length, - input_length, batch_size, same_data_in_index_row); - } else if (index_type == framework::proto::VarType::INT32) { - const int* index_data = index->data(); - IndexSampleGrad<<>>( - index_data, input_grad_data, output_grad_data, index_length, - input_length, batch_size, same_data_in_index_row); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - index_sample, - ops::IndexSampleKernel, - ops::IndexSampleKernel, - ops::IndexSampleKernel, - ops::IndexSampleKernel); -REGISTER_OP_CUDA_KERNEL( - index_sample_grad, - ops::IndexSampleGradKernel, - ops::IndexSampleGradKernel, - ops::IndexSampleGradKernel, - ops::IndexSampleGradKernel); diff --git a/paddle/fluid/operators/index_sample_op.h b/paddle/fluid/operators/index_sample_op.h deleted file mode 100644 index 6cc8ff04c544554e805c605783c9bedf1b9fcb7b..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/index_sample_op.h +++ /dev/null @@ -1,198 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include "gflags/gflags.h" -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; -using DDim = framework::DDim; - -template -void IndexSampleInner(const framework::ExecutionContext &context, - const LoDTensor &input, const LoDTensor &index, - LoDTensor *output) { - auto input_dims = input.dims(); - auto index_dims = index.dims(); - - int batch_size = input_dims[0]; - auto value_length = input_dims[1]; - auto index_length = index_dims[1]; - int index_ids_num = index.numel(); - - std::vector input_vec; - std::vector index_vec; - paddle::framework::TensorToVector(input, context.device_context(), - &input_vec); - paddle::framework::TensorToVector(index, context.device_context(), - &index_vec); - - std::vector res(index_ids_num); - for (int i = 0; i < index_ids_num; i++) { - int b = floor(i / index_length); - PADDLE_ENFORCE_GE( - index_vec[i], 0, - platform::errors::InvalidArgument( - "Variable value (index) of OP(index_sample) " - "expected >= 0 and < %ld, but got %ld. Please check input " - "value.", - value_length, index_vec[i])); - PADDLE_ENFORCE_LT( - index_vec[i], value_length, - platform::errors::InvalidArgument( - "Variable value (index) of OP(index_sample) " - "expected >= 0 and < %ld, but got %ld. Please check input " - "value.", - value_length, index_vec[i])); - - int v_i = b * value_length + static_cast(index_vec[i]); - T v = input_vec[v_i]; - VLOG(4) << "Index Sample: batch = " << b << " index = " << v_i - << " value = " << v; - res[i] = v; - } - - auto ddim = phi::make_ddim({batch_size, index_length}); - output->mutable_data(context.GetPlace()); - framework::TensorFromVector(res, context.device_context(), output); - output->Resize(ddim); -} - -template -class IndexSampleKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *input_var = ctx.InputVar("X"); - auto *index_var = ctx.InputVar("Index"); - - auto &input_tensor = input_var->Get(); - auto &index_tensor = index_var->Get(); - - auto *out_var = ctx.OutputVar("Out"); - auto *out_tensor = out_var->GetMutable(); - - const auto &index_type = - framework::TransToProtoVarType(index_tensor.dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Input(Index) holds the wrong type, it holds %s, but " - "desires to be %s or %s", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - if (index_type == framework::proto::VarType::INT32) { - IndexSampleInner(ctx, input_tensor, index_tensor, out_tensor); - } else if (index_type == framework::proto::VarType::INT64) { - IndexSampleInner(ctx, input_tensor, index_tensor, out_tensor); - } - } -}; - -template -void IndexSampleGradInner(const framework::ExecutionContext &context, - const LoDTensor &out_grad, const LoDTensor &index, - LoDTensor *x_grad) { - std::vector out_grad_vec; - std::vector index_vec; - paddle::framework::TensorToVector(out_grad, context.device_context(), - &out_grad_vec); - paddle::framework::TensorToVector(index, context.device_context(), - 
&index_vec); - - auto index_dims = index.dims(); - auto x_grad_dims = x_grad->dims(); - - auto value_length = x_grad_dims[1]; - auto index_length = index_dims[1]; - int index_ids_num = index.numel(); - - std::vector x_grad_vec(x_grad->numel(), 0); - - for (int i = 0; i < index_ids_num; i++) { - int b = floor(i / index_length); - PADDLE_ENFORCE_GE( - index_vec[i], 0, - platform::errors::InvalidArgument( - "Variable value (index) of OP(index_sample_grad) " - "expected >= 0 and < %ld, but got %ld. Please check input " - "value.", - value_length, index_vec[i])); - PADDLE_ENFORCE_LT( - index_vec[i], value_length, - platform::errors::InvalidArgument( - "Variable value (index) of OP(index_sample_grad) " - "expected >= 0 and < %ld, but got %ld. Please check input " - "value.", - value_length, index_vec[i])); - int v_i = b * value_length + static_cast(index_vec[i]); - x_grad_vec[v_i] += out_grad_vec[i]; - } - x_grad->mutable_data(context.GetPlace()); - framework::TensorFromVector(x_grad_vec, context.device_context(), x_grad); - x_grad->Resize(x_grad_dims); -} - -template -class IndexSampleGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - auto *index_var = context.InputVar("Index"); - auto *x_grad_var = context.OutputVar(framework::GradVarName("X")); - auto *out_grad_var = context.InputVar(framework::GradVarName("Out")); - - auto &index_tensor = index_var->Get(); - auto &out_grad_tensor = out_grad_var->Get(); - auto *x_grad_tensor = x_grad_var->GetMutable(); - - const auto &index_type = - framework::TransToProtoVarType(index_tensor.dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Input(Index) holds the wrong type, it holds %s, but " - "desires to be %s or %s", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - if (index_type == framework::proto::VarType::INT32) { - IndexSampleGradInner(context, out_grad_tensor, index_tensor, - x_grad_tensor); - } else if (index_type == framework::proto::VarType::INT64) { - IndexSampleGradInner(context, out_grad_tensor, index_tensor, - x_grad_tensor); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/index_sample_op_npu.cc b/paddle/fluid/operators/index_sample_op_npu.cc index f460d0622bccc2e71b1e147c0c9add688c3b11c4..38eb5b4514993412fa3a6c96ccc92e899c57b205 100644 --- a/paddle/fluid/operators/index_sample_op_npu.cc +++ b/paddle/fluid/operators/index_sample_op_npu.cc @@ -12,8 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
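
The index_sample CPU and CUDA kernels deleted above are being migrated to the phi kernel library; their semantics, written as a host-side reference with hypothetical names, are a per-row gather:

// Reference semantics of index_sample: out[b][i] = x[b][ index[b][i] ].
#include <cstddef>
#include <vector>

template <typename T, typename IndexT>
std::vector<T> IndexSampleRef(const std::vector<T>& x,
                              const std::vector<IndexT>& index,
                              int batch_size, int input_length,
                              int index_length) {
  std::vector<T> out(static_cast<std::size_t>(batch_size) * index_length);
  for (int b = 0; b < batch_size; ++b) {
    for (int i = 0; i < index_length; ++i) {
      // Each index value is expected to lie in [0, input_length).
      IndexT idx = index[b * index_length + i];
      out[b * index_length + i] = x[b * input_length + idx];
    }
  }
  return out;
}
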
*/ -#include "paddle/fluid/operators/index_sample_op.h" - +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/layer_norm_kernel.cu.h b/paddle/fluid/operators/layer_norm_kernel.cu.h index b31c7a1cde0f18edb00435805ce4b2a089f9eb1a..62c21dd2eee401e5f8a526870015c18cf13ee873 100644 --- a/paddle/fluid/operators/layer_norm_kernel.cu.h +++ b/paddle/fluid/operators/layer_norm_kernel.cu.h @@ -474,11 +474,11 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_bwd_1024_kernel( for (int it = 0; it < LDGS; it++) { #pragma unroll for (int jt = 0; jt < VecSize; jt++) { - U x_tmp = x[it][jt]; + U x_tmp = static_cast(x[it][jt]); U y_tmp = var_cur_row * (x_tmp - mean_cur_row); U dy_tmp = static_cast(gamma[it][jt]) * - static_cast(dout[it][jt]); // scale * dy - U dout_tmp = dout[it][jt]; // dy + static_cast(dout[it][jt]); // scale * dy + U dout_tmp = static_cast(dout[it][jt]); // dy // used for get dx (row reduction) sum_loss1 += dy_tmp; // scale * dy, sum_1 diff --git a/paddle/fluid/operators/layer_norm_op.cu b/paddle/fluid/operators/layer_norm_op.cu index d439b3220d96ecd1107d6c29850d3d5356a01e09..dfe73d3727132ae9b8f71e2a415ef5193f303493 100644 --- a/paddle/fluid/operators/layer_norm_op.cu +++ b/paddle/fluid/operators/layer_norm_op.cu @@ -259,6 +259,21 @@ REGISTER_OP_CUDA_KERNEL( ops::LayerNormGradKernel, ops::LayerNormGradKernel); +#elif CUDNN_VERSION_MIN(8, 1, 0) +REGISTER_OP_CUDA_KERNEL( + layer_norm, + ops::LayerNormKernel, + ops::LayerNormKernel, + ops::LayerNormKernel, + ops::LayerNormKernel); +REGISTER_OP_CUDA_KERNEL( + layer_norm_grad, + ops::LayerNormGradKernel, + ops::LayerNormGradKernel, + ops::LayerNormGradKernel, + ops::LayerNormGradKernel); #else REGISTER_OP_CUDA_KERNEL( layer_norm, diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc index fcd5c06a6f310f8a23608a77f2d6b9098e99b33a..5ac39953462b5078aa663a7f39f5eb95c96bae7a 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cc +++ b/paddle/fluid/operators/math/selected_rows_functor.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/math/selected_rows_functor.h" +#include "paddle/fluid/platform/device/device_wrapper.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/operators/mkldnn/axpy_handler.h" @@ -502,32 +503,29 @@ struct MergeAdd { out.mutable_value()->mutable_data( phi::make_ddim({static_cast(merge_rows.size()), input_width}), context.GetPlace()); - int r = - xpu::constant(context.x_context(), out.mutable_value()->data(), - merge_rows.size() * input_width, static_cast(0.f)); - PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::External("XPU constant op return" - " wrong value[%d %s].", - r, XPUAPIErrorMsg[r])); std::unordered_map rows_to_id; for (size_t i = 0; i < merge_rows.size(); ++i) { rows_to_id[merge_rows[i]] = i; } - auto* out_data = out.mutable_value()->data(); - auto* input_data = input.value().data(); + auto* y_data = out.mutable_value()->data(); + auto* x_data = input.value().data(); + int xm = input_rows.size(); + int ym = merge_rows.size(); int n = input_width; - for (size_t i = 0; i < input_rows.size(); i++) { - size_t out_i = rows_to_id[input_rows[i]]; - auto r = xpu::add(context.x_context(), &input_data[i * input_width], - &out_data[out_i * input_width], - &out_data[out_i * input_width], n); - PADDLE_ENFORCE_EQ( - r, XPU_SUCCESS, - platform::errors::External("XPU API return wrong value[%d %s], ", r, - XPUAPIErrorMsg[r])); - } + + xpu::ctx_guard RAII_GUARD(context.x_context()); + int64_t* x_rows_data = RAII_GUARD.alloc_l3_or_gm(xm); + int64_t* y_rows_data = RAII_GUARD.alloc_l3_or_gm(ym); + memory::Copy(context.GetPlace(), y_rows_data, platform::CPUPlace(), + merge_rows.data(), ym * sizeof(int64_t)); + memory::Copy(context.GetPlace(), x_rows_data, platform::CPUPlace(), + input_rows.data(), xm * sizeof(int64_t)); + int r = + xpu::merge_dup_rows(context.x_context(), x_data, y_data, + x_rows_data, y_rows_data, xm, n, ym); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "merge_dup_rows"); } void operator()(const platform::XPUDeviceContext& context, @@ -582,15 +580,7 @@ struct MergeAdd { {static_cast(merged_row_set.size()), input_width}), context.GetPlace()); - int r = - xpu::constant(context.x_context(), out.mutable_value()->data(), - merge_rows.size() * input_width, static_cast(0.f)); - PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::External("XPU constant op return" - " wrong value[%d %s].", - r, XPUAPIErrorMsg[r])); - - float* out_data = reinterpret_cast(out.mutable_value()->data()); + float* y_data = reinterpret_cast(out.mutable_value()->data()); std::unordered_map rows_to_id; for (size_t i = 0; i < merge_rows.size(); ++i) { @@ -603,17 +593,22 @@ struct MergeAdd { } auto& input_rows = input->rows(); + auto* x_data = input->value().data(); + int xm = input_rows.size(); + int ym = merge_rows.size(); int n = input_width; - for (size_t i = 0; i < input_rows.size(); i++) { - size_t out_i = rows_to_id[input_rows[i]]; - auto r = xpu::add( - context.x_context(), input->value().data() + i * input_width, - &out_data[out_i * input_width], &out_data[out_i * input_width], n); - PADDLE_ENFORCE_EQ( - r, XPU_SUCCESS, - platform::errors::External("XPU API return wrong value[%d %s], ", r, - XPUAPIErrorMsg[r])); - } + + xpu::ctx_guard RAII_GUARD(context.x_context()); + int64_t* x_rows_data = RAII_GUARD.alloc_l3_or_gm(xm); + int64_t* y_rows_data = RAII_GUARD.alloc_l3_or_gm(ym); + memory::Copy(context.GetPlace(), y_rows_data, platform::CPUPlace(), + merge_rows.data(), ym * sizeof(int64_t)); + memory::Copy(context.GetPlace(), x_rows_data, platform::CPUPlace(), + 
input_rows.data(), xm * sizeof(int64_t)); + int r = + xpu::merge_dup_rows(context.x_context(), x_data, y_data, + x_rows_data, y_rows_data, xm, n, ym); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "merge_dup_rows"); } } }; diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu index 8563d8b05b186c025ecc4c970a400765adeb0c5d..a4678550cf7bd0d4aa2759d4887dddabed5f9ba4 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cu +++ b/paddle/fluid/operators/math/selected_rows_functor.cu @@ -16,6 +16,7 @@ limitations under the License. */ #include #include "paddle/fluid/operators/math/selected_rows_functor.h" +#include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" #include "paddle/phi/kernels/funcs/math_function.h" @@ -445,6 +446,7 @@ template struct MergeAdd; template struct MergeAdd; template struct MergeAdd; template struct MergeAdd; +template struct MergeAdd; template struct MergeAdd>; template struct MergeAdd>; diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cc b/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cc index 28c6efef14178535d7f9473c2310552037952c9f..efec50efa92ea68cb68934bde32e1f56570b0868 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cc +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cc @@ -61,30 +61,31 @@ class DistributedFusedLambInitOpMaker "The fp32 beta1 power accumulator tensor. Its shape is [1]."); AddOutput("Beta2Pow", "The fp32 beta2 power accumulator tensor. Its shape is [1]."); - AddOutput("FusedIndices", - "The param index of each element in FP32FusedParam. Its shape is " - "[M1+M2]. It is like [0,0,0,1,1,1,1,2,2,...]."); AddOutput( "FusedParamOffsets", "The numel offset of each parameter inside the FP32FusedParam. Its " "shape is [param_num + 1]. It is like [0, n_0, n_0 + n_1, n_0 + n_1 " - "+ n_2, ...]."); - AddOutput("FP32ShardFusedParamOffsets", - "The sharded numel offset of each parameter in the local rank. " - "Its shape is [fp32_local_param_num + 1]."); - AddOutput("FP16ShardFusedParamOffsets", - "The sharded numel offset of each parameter in the local rank. " - "Its shape is [fp16_local_param_num + 1]."); + "+ n_2, ...]. It should be in CPUPlace."); AddOutput( - "WeightDecay", - "The sharded fp32 weight decay tensor. Its shape is [(M1+M2)/N]."); + "FP32ShardFusedParamOffsets", + "The sharded numel offset of each parameter in the local rank. " + "Its shape is [fp32_local_param_num + 1]. It should be in CPUPlace."); + AddOutput( + "FP16ShardFusedParamOffsets", + "The sharded numel offset of each parameter in the local rank. " + "Its shape is [fp16_local_param_num + 1]. It should be in CPUPlace."); AddOutput("ParamInfo", "The param info. It should be in CPUPlace, and its shape is [6]" - "CPUPlace, and its shape is [6]. It is " + "CPUPlace, and its shape is [8]. It is " "[fp32_shard_param_start_idx, fp32_local_param_num, " - "fp32_global_param_num, fp16_shard_param_start_idx, " - "fp16_local_param_num, fp16_global_param_num]."); - + "fp32_global_param_num, fp32_weight_decay_end_idx, " + "fp16_shard_param_start_idx, " + "fp16_local_param_num, fp16_global_param_num, " + "fp16_weight_decay_end_idx]."); + AddOutput("ParamOrder", + "The reordered parameter order. 
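
The xpu::merge_dup_rows call introduced above replaces the previous constant-then-add loop; its effect, sketched device-independently with illustrative names, is to accumulate input rows that share a row id into a single output row of width n:

// Device-independent sketch of MergeAdd / merge_dup_rows behaviour.
#include <cstddef>
#include <cstdint>
#include <map>
#include <vector>

template <typename T>
void MergeDupRowsRef(const std::vector<int64_t>& x_rows, const T* x_data,
                     int n, std::vector<int64_t>* y_rows,
                     std::vector<T>* y_data) {
  std::map<int64_t, int> row_to_out;  // row id -> output row index
  for (int64_t row : x_rows) {
    if (row_to_out.emplace(row, static_cast<int>(row_to_out.size())).second) {
      y_rows->push_back(row);  // first occurrence defines the output order
    }
  }
  y_data->assign(y_rows->size() * static_cast<std::size_t>(n),
                 static_cast<T>(0));
  for (std::size_t i = 0; i < x_rows.size(); ++i) {
    int out_i = row_to_out[x_rows[i]];
    for (int j = 0; j < n; ++j) {
      (*y_data)[out_i * n + j] += x_data[i * n + j];
    }
  }
}
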
Inside this op, " + "the parameter would be reordered by data type and weight decay " + "value."); AddOutput("ParamOut", "The output parameter list.").AsDuplicable(); AddOutput("MasterParamOut", "The output master parameter list. It would share the memory of " @@ -96,10 +97,8 @@ class DistributedFusedLambInitOpMaker AddAttr("beta1", "The initial value of Beta1Pow."); AddAttr("beta2", "The initial value of Beta2Pow."); - AddAttr>( - "weight_decay", - "The weight decay for each parameter. Its " - "shape is equal to the global parameter number."); + AddAttr>("apply_weight_decay", + "Whether to apply weight decay."); AddAttr("alignment", "The alignment in bytes for the fused tensors."); AddAttr("rank", "The global rank of the current process."); AddAttr("nranks", "The global world size."); diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu index 3445e9b658becda84aa678e9c1f03b3436d63b70..7d8a7186d58b402e208fc749524d996b351abeef 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu @@ -258,32 +258,6 @@ static void ShareBufferForNonInitedTensor(framework::Tensor *origin, << ") , dtype = " << fused_out->dtype(); } -template -static __global__ void LambFillFusedIndicesCUDAKernel(const OffsetT *offsets, - IndexT *out, - int offset_num, - int out_num) { - CUDA_KERNEL_LOOP_TYPE(i, out_num, int) { - auto idx = phi::funcs::LowerBound(offsets, offset_num, i); - if (idx == offset_num || offsets[idx] != i) { - --idx; - } - out[i] = idx; - } -} - -template -static void CopyVectorToTensor(const std::vector &src, - framework::Tensor *dst, - const platform::Place &place, - gpuStream_t stream) { - dst->Resize({static_cast(src.size())}); - T *dst_ptr = dst->mutable_data(place); - const T *src_ptr = src.data(); - auto nbytes = src.size() * sizeof(T); - memory::Copy(place, dst_ptr, platform::CPUPlace(), src_ptr, nbytes, stream); -} - template static void CopyVectorToCPUTensor(const std::vector &src, framework::Tensor *dst) { @@ -294,6 +268,42 @@ static void CopyVectorToCPUTensor(const std::vector &src, std::memcpy(dst_ptr, src_ptr, nbytes); } +static size_t ReorderParamGradInfoList(const std::vector &flags, + std::vector *infos) { + size_t n = infos->size(); + std::vector cur_flags; + cur_flags.reserve(n); + for (size_t i = 0; i < n; ++i) { + auto idx = (*infos)[i].idx; + cur_flags.push_back(flags[idx]); + } + + auto origin_infos = *infos; + size_t j = 0; + for (size_t i = 0; i < n; ++i) { + if (cur_flags[i]) { + (*infos)[j] = origin_infos[i]; + ++j; + } + } + size_t ret_idx = j; + + for (size_t i = 0; i < n; ++i) { + if (!cur_flags[i]) { + (*infos)[j] = origin_infos[i]; + ++j; + } + } + return ret_idx; +} + +template +static T ClipByBound(T x, T low_value, T high_value) { + if (x < low_value) return low_value; + if (x > high_value) return high_value; + return x; +} + template class DistributedFusedLambInitOpKernel : public framework::OpKernel { @@ -404,6 +414,24 @@ class DistributedFusedLambInitOpKernel info->numel_offset = 0; // not determined yet } } + const auto &apply_weight_decay = + ctx.Attr>("apply_weight_decay"); + size_t fp32_wd_end_idx = + ReorderParamGradInfoList(apply_weight_decay, &fp32_infos); + size_t fp16_wd_end_idx = + ReorderParamGradInfoList(apply_weight_decay, &fp16_infos); + + auto *param_order_t = ctx.Output("ParamOrder"); + auto param_num = fp32_infos.size() + fp16_infos.size(); + 
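
ReorderParamGradInfoList added above is a stable partition: parameters whose apply_weight_decay flag is set are moved to the front (keeping their relative order), and the return value marks where the decayed range ends. A simplified sketch follows; indexing flags directly by position rather than through info.idx is an assumption made for brevity:

// Stable partition by flag; returns the number of flagged entries,
// i.e. the "weight decay end index" used later in ParamInfo.
#include <cstddef>
#include <vector>

template <typename Info>
std::size_t StablePartitionByFlag(const std::vector<int>& flags,
                                  std::vector<Info>* infos) {
  std::vector<Info> origin = *infos;
  std::size_t j = 0;
  for (std::size_t i = 0; i < origin.size(); ++i) {
    if (flags[i]) (*infos)[j++] = origin[i];   // flagged entries first
  }
  std::size_t ret = j;
  for (std::size_t i = 0; i < origin.size(); ++i) {
    if (!flags[i]) (*infos)[j++] = origin[i];  // then the rest, order kept
  }
  return ret;
}
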
param_order_t->Resize({static_cast(param_num)}); + auto *param_order = param_order_t->mutable_data(platform::CPUPlace()); + for (size_t i = 0; i < fp32_infos.size(); ++i) { + param_order[i] = static_cast(fp32_infos[i].idx); + } + for (size_t i = 0; i < fp16_infos.size(); ++i) { + param_order[i + fp32_infos.size()] = static_cast(fp16_infos[i].idx); + } + VLOG(10) << "Fill ParamGradInfo ends"; // Step 2: determine the numel_with_padding and numel_offset @@ -568,45 +596,29 @@ class DistributedFusedLambInitOpKernel VLOG(10) << "Found the sharding arguments"; auto *param_info_t = ctx.Output("ParamInfo"); - param_info_t->Resize({6}); + param_info_t->Resize({8}); auto *param_info = param_info_t->mutable_data(platform::CPUPlace()); param_info[0] = static_cast(fp32_start_idx); param_info[1] = static_cast(fp32_local_param_num); param_info[2] = static_cast(fp32_infos.size()); - param_info[3] = static_cast(fp16_start_idx + fp32_infos.size()); - param_info[4] = static_cast(fp16_local_param_num); - param_info[5] = static_cast(fp16_infos.size()); + param_info[3] = ClipByBound(fp32_wd_end_idx, fp32_start_idx, + fp32_start_idx + fp32_local_param_num) - + static_cast(fp32_start_idx); + param_info[4] = static_cast(fp16_start_idx + fp32_infos.size()); + param_info[5] = static_cast(fp16_local_param_num); + param_info[6] = static_cast(fp16_infos.size()); + param_info[7] = ClipByBound(fp16_wd_end_idx, fp16_start_idx, + fp16_start_idx + fp16_local_param_num) - + static_cast(fp16_start_idx); VLOG(10) << "Start FP32 idx: " << param_info[0]; VLOG(10) << "Local FP32 param num: " << param_info[1]; VLOG(10) << "Global FP32 param num: " << param_info[2]; - VLOG(10) << "Start FP16 idx: " << param_info[3]; - VLOG(10) << "Local FP16 param num: " << param_info[4]; - VLOG(10) << "Global FP16 param num: " << param_info[5]; + VLOG(10) << "Start FP16 idx: " << param_info[4]; + VLOG(10) << "Local FP16 param num: " << param_info[5]; + VLOG(10) << "Global FP16 param num: " << param_info[6]; - // For WeightDecay, shard and perform H2D copy - const auto &origin_weight_decay = - ctx.Attr>("weight_decay"); - PADDLE_ENFORCE_EQ(params.size(), origin_weight_decay.size(), - platform::errors::InvalidArgument( - "The attr(weight_decay) should have the " - "same length with Input(Param).")); - std::vector shard_weight_decay; - shard_weight_decay.reserve(total_local_param_num); - for (size_t i = 0; i < fp32_local_param_num; ++i) { - shard_weight_decay.push_back( - origin_weight_decay[fp32_infos[i + fp32_start_idx].idx]); - } - for (size_t i = 0; i < fp16_local_param_num; ++i) { - shard_weight_decay.push_back( - origin_weight_decay[fp16_infos[i + fp16_start_idx].idx]); - } - - // For FusedIndices, launch CUDA kernel to do binary search - auto *fused_indices_t = ctx.Output("FusedIndices"); - fused_indices_t->Resize({static_cast(total_numel)}); - auto *fused_indices = fused_indices_t->mutable_data(place); std::vector numel_offsets; numel_offsets.reserve(params.size() + 1); for (const auto &info : fp32_infos) { @@ -621,21 +633,6 @@ class DistributedFusedLambInitOpKernel "The numel_offsets number must be one larger than " "the parameter number.")); VLOG(10) << "Total numel offset: " << FlattenToString(numel_offsets); - auto *fused_param_offset_t = - ctx.Output("FusedParamOffsets"); - fused_param_offset_t->Resize({static_cast(numel_offsets.size())}); - auto *fused_param_offset = fused_param_offset_t->mutable_data(place); - memory::Copy(place, fused_param_offset, platform::CPUPlace(), - numel_offsets.data(), - numel_offsets.size() * 
sizeof(numel_offsets[0]), stream); - auto config = platform::GetGpuLaunchConfig1D(dev_ctx, total_numel); - LambFillFusedIndicesCUDAKernel<<>>( - fused_param_offset, fused_indices, numel_offsets.size() - 1, - total_numel); - - std::vector lengths; - lengths.reserve(fp32_local_param_num + fp16_local_param_num); std::vector fp32_partial_numel_offsets; fp32_partial_numel_offsets.reserve(fp32_local_param_num + 1); @@ -659,9 +656,9 @@ class DistributedFusedLambInitOpKernel VLOG(10) << "FP32 Partial numel = [" << valid_start_n + fp32_infos[i].numel << "," << end_n + fp32_infos[i].numel; - lengths.push_back(end_n - valid_start_n); + auto len = end_n - valid_start_n; fp32_partial_numel_offsets.push_back(fp32_partial_numel_offsets.back() + - lengths.back()); + len); } std::vector fp16_partial_numel_offsets; @@ -682,9 +679,9 @@ class DistributedFusedLambInitOpKernel PADDLE_ENFORCE_NE(valid_start_n, end_n, platform::errors::InvalidArgument( "Indices sharding error. This may be a bug.")); - lengths.push_back(end_n - valid_start_n); + auto len = end_n - valid_start_n; fp16_partial_numel_offsets.push_back(fp16_partial_numel_offsets.back() + - lengths.back()); + len); } CopyVectorToCPUTensor(numel_offsets, @@ -696,23 +693,6 @@ class DistributedFusedLambInitOpKernel fp16_partial_numel_offsets, ctx.Output("FP16ShardFusedParamOffsets")); - // Fill the weight decay tensor - PADDLE_ENFORCE_EQ(lengths.size(), shard_weight_decay.size(), - platform::errors::InvalidArgument( - "Invalid weight decay sharding. This may be a bug.")); - std::vector wd_cpu; - for (size_t i = 0; i < shard_weight_decay.size(); ++i) { - int len = lengths[i]; - for (int j = 0; j < len; ++j) { - wd_cpu.push_back(shard_weight_decay[i]); - } - } - PADDLE_ENFORCE_EQ(wd_cpu.size() * nranks, fp32_numel + fp16_numel, - platform::errors::InvalidArgument( - "Invalid weight decay sharding. This may be a bug.")); - CopyVectorToTensor(wd_cpu, ctx.Output("WeightDecay"), - place, stream); - auto *global_scale = ctx.Output("GlobalScale"); if (!global_scale->IsInitialized()) { TensorFillConstant(dev_ctx, global_scale, {1}, 1.0f); diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc index e5b27446eb330aeb08e134332a5366c6c6ed2908..8f7c87912e93aa1bb3178d37afa641047e15a82b 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc @@ -66,28 +66,31 @@ class DistributedFusedLambOpMaker : public framework::OpProtoAndCheckerMaker { "The fp32 beta1 power accumulator tensor. Its shape is [1]."); AddInput("Beta2Pow", "The fp32 beta2 power accumulator tensor. Its shape is [1]."); - AddInput("FusedIndices", - "The param index of each element in FP32FusedParam. Its shape is " - "[M1+M2]. It is like [0,0,0,1,1,1,1,2,2,...]."); AddInput( "FusedParamOffsets", "The numel offset of each parameter inside the FP32FusedParam. Its " "shape is [param_num + 1]. It is like [0, n_0, n_0 + n_1, n_0 + n_1 " - "+ n_2, ...]."); - AddInput("FP32ShardFusedParamOffsets", - "The sharded numel offset of each parameter in the local rank. " - "Its shape is [fp32_local_param_num + 1]."); - AddInput("FP16ShardFusedParamOffsets", - "The sharded numel offset of each parameter in the local rank. " - "Its shape is [fp16_local_param_num + 1]."); - AddInput("WeightDecay", - "The sharded fp32 weight decay tensor. Its shape is [(M1+M2)/N]."); + "+ n_2, ...]. 
It should be in CPUPlace."); + AddInput( + "FP32ShardFusedParamOffsets", + "The sharded numel offset of each parameter in the local rank. " + "Its shape is [fp32_local_param_num + 1]. It should be in CPUPlace."); + AddInput( + "FP16ShardFusedParamOffsets", + "The sharded numel offset of each parameter in the local rank. " + "Its shape is [fp16_local_param_num + 1]. It should be in CPUPlace."); AddInput("ParamInfo", "The param info. It should be in CPUPlace, and its shape is [6]" - "CPUPlace, and its shape is [6]. It is " + "CPUPlace, and its shape is [8]. It is " "[fp32_shard_param_start_idx, fp32_local_param_num, " - "fp32_global_param_num, fp16_shard_param_start_idx, " - "fp16_local_param_num, fp16_global_param_num]."); + "fp32_global_param_num, fp32_weight_decay_end_idx, " + "fp16_shard_param_start_idx, " + "fp16_local_param_num, fp16_global_param_num, " + "fp16_weight_decay_end_idx]."); + AddInput("ParamOrder", + "The reordered parameter order. Inside this op, " + "the parameter would be reordered by data type and weight decay " + "value."); AddInput("LearningRate", "The fp32 learning rate tensor. Its shape is [1]."); @@ -116,6 +119,7 @@ class DistributedFusedLambOpMaker : public framework::OpProtoAndCheckerMaker { "max_global_grad_norm", "The maximum global gradient l2-norm value for clipping. If " "max_global_grad_norm <= 0, no clipping would be performed."); + AddAttr("weight_decay", "The weight decay value."); AddAttr("clip_after_allreduce", "Whether to clip before allreduce, only valid when the " "world size is larger than 1."); diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu index 3f90140f77282983f42ef03f736c35960239dd75..ca0828a6f6ab71a010ae35318fed23a8072aa69d 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu @@ -87,7 +87,7 @@ struct L2NormFunctor { } }; -template +template static __global__ void MultiTensorL2NormReduceAgainCUDAKernel( const InT *x, OutT *y, int max_chunk_num) { int tensor_id = blockIdx.x; @@ -100,11 +100,7 @@ static __global__ void MultiTensorL2NormReduceAgainCUDAKernel( } sum = BlockReduce(storage).Reduce(sum, cub::Sum()); if (threadIdx.x == 0) { - if (NeedSqrt) { - y[blockIdx.x] = static_cast(sqrtf(sum)); - } else { - y[blockIdx.x] = static_cast(sum); - } + y[blockIdx.x] = static_cast(sum); } } @@ -118,6 +114,7 @@ static int GetChunkedVecSize(const T *ptr, int chunk_size) { constexpr int vec8 = alignof(platform::AlignedVector); constexpr int vec4 = alignof(platform::AlignedVector); constexpr int vec2 = alignof(platform::AlignedVector); + chunk_size *= sizeof(T); if (address % vec8 == 0 && chunk_size % vec8 == 0) { return std::min(8, valid_vec_size); } else if (address % vec4 == 0 && chunk_size % vec4 == 0) { @@ -129,27 +126,26 @@ static int GetChunkedVecSize(const T *ptr, int chunk_size) { } } -#define PD_VEC_MULTI_TENSOR_APPLY_CASE(__vec_size, ...) \ - case __vec_size: { \ - constexpr int kVecSize = __vec_size; \ - __VA_ARGS__; \ - break; \ +#define PD_VEC_LAUNCH_KERNEL_CASE(__vec_size, ...) \ + case __vec_size: { \ + constexpr int kVecSize = __vec_size; \ + __VA_ARGS__; \ + break; \ } -#define PD_VEC_MULTI_TENSOR_APPLY(__vec_size, ...) 
\ - do { \ - switch (__vec_size) { \ - PD_VEC_MULTI_TENSOR_APPLY_CASE(8, __VA_ARGS__); \ - PD_VEC_MULTI_TENSOR_APPLY_CASE(4, __VA_ARGS__); \ - PD_VEC_MULTI_TENSOR_APPLY_CASE(2, __VA_ARGS__); \ - PD_VEC_MULTI_TENSOR_APPLY_CASE(1, __VA_ARGS__); \ - } \ +#define PD_VEC_LAUNCH_KERNEL(__vec_size, ...) \ + do { \ + switch (__vec_size) { \ + PD_VEC_LAUNCH_KERNEL_CASE(8, __VA_ARGS__); \ + PD_VEC_LAUNCH_KERNEL_CASE(4, __VA_ARGS__); \ + PD_VEC_LAUNCH_KERNEL_CASE(2, __VA_ARGS__); \ + PD_VEC_LAUNCH_KERNEL_CASE(1, __VA_ARGS__); \ + } \ } while (0) // TODO(zengjinle): which chunk_size is better? -template +template static void MultiTensorL2Norm(const platform::CUDAPlace &place, gpuStream_t stream, const InT *x, const int *offsets, int n, OutT *y, @@ -158,7 +154,7 @@ static void MultiTensorL2Norm(const platform::CUDAPlace &place, constexpr int kNumTensor = MaxTensorNumPerLaunch; constexpr int kNumChunk = MaxChunkNumPerLaunch; - constexpr int kBlockDim = BlockDim; + constexpr int kBlockDim = 512; int max_chunk_num = -1; int vec_size = 8; @@ -181,22 +177,22 @@ static void MultiTensorL2Norm(const platform::CUDAPlace &place, auto *tmp_out_ptr = tmp_out.Alloc(n * max_chunk_num); FillZeroWithPtr(tmp_out_ptr, n * max_chunk_num, stream); -#define PD_LAUNCH_MULTI_TENSOR_APPLY_KERNEL \ - do { \ - using FunctorT = L2NormFunctor; \ - VLOG(10) << __func__ << " " << typeid(InT).name() \ - << " VecSize = " << kVecSize; \ - MultiTensorApply( \ - FunctorT(), stream, offsets, n, chunk_size, x, tmp_out_ptr, \ - max_chunk_num); \ +#define PD_LAUNCH_MULTI_TENSOR_APPLY_L2_NORM_KERNEL \ + do { \ + using FunctorT = L2NormFunctor; \ + VLOG(10) << __func__ << " " << typeid(InT).name() \ + << " VecSize = " << kVecSize; \ + MultiTensorApply( \ + FunctorT(), stream, offsets, n, chunk_size, kBlockDim, x, tmp_out_ptr, \ + max_chunk_num); \ } while (0) - PD_VEC_MULTI_TENSOR_APPLY(vec_size, PD_LAUNCH_MULTI_TENSOR_APPLY_KERNEL); -#undef PD_LAUNCH_MULTI_TENSOR_APPLY_KERNEL + PD_VEC_LAUNCH_KERNEL(vec_size, PD_LAUNCH_MULTI_TENSOR_APPLY_L2_NORM_KERNEL); +#undef PD_LAUNCH_MULTI_TENSOR_APPLY_L2_NORM_KERNEL - MultiTensorL2NormReduceAgainCUDAKernel<<>>( - tmp_out_ptr, y, max_chunk_num); + MultiTensorL2NormReduceAgainCUDAKernel< + MT, OutT, kBlockDim><<>>(tmp_out_ptr, y, + max_chunk_num); } template @@ -208,34 +204,17 @@ static void LogParamAndTrustRatioDivSquareNorm( auto tensors = ctx.MultiInput("Param"); if (tensors.empty()) return; + const auto *order = ctx.Input("ParamOrder")->data(); + size_t n = tensors.size(); auto place = tensors[0]->place(); auto pn_vec = ToVector(param_square_norm, n, place); auto tn_vec = ToVector(trust_ratio_div_square_norm, n, place); - std::vector fp32_indices, fp16_indices; - fp32_indices.reserve(n); - fp16_indices.reserve(n); - for (size_t i = 0; i < n; ++i) { - const auto *t = tensors[i]; - if (t->dtype() == phi::DataType::FLOAT32) { - fp32_indices.push_back(i); - } else if (t->dtype() == phi::DataType::FLOAT16) { - fp16_indices.push_back(i); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported data type %s.", t->dtype())); - } - } - - for (auto idx : fp16_indices) { - fp32_indices.push_back(idx); - } - const auto &names = ctx.GetOp().Inputs("Param"); - for (size_t i = 0; i < fp32_indices.size(); ++i) { - auto idx = fp32_indices[i]; + for (size_t i = 0; i < n; ++i) { + auto idx = order[i]; VLOG(LogLevel) << "Param " << tensors[idx]->dtype() << " " << names[idx] << " pn = " << pn_vec[i] << " , tn = " << tn_vec[i]; } @@ -353,7 +332,7 @@ static __global__ void 
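
The vector size used by MultiTensorL2Norm comes from GetChunkedVecSize, which this diff fixes to compare the chunk size in bytes (the added chunk_size *= sizeof(T)). A simplified host-side sketch of that selection, assuming the alignment of AlignedVector<T, N> is N * sizeof(T) and omitting the valid_vec_size cap:

// Pick the widest vector width whose byte alignment divides both the
// pointer address and the chunk size in bytes.
#include <cstdint>
#include <initializer_list>

template <typename T>
int ChooseVecSizeRef(const T* ptr, int chunk_size_in_elems) {
  std::uintptr_t address = reinterpret_cast<std::uintptr_t>(ptr);
  int chunk_bytes = chunk_size_in_elems * static_cast<int>(sizeof(T));
  for (int vec : {8, 4, 2}) {
    int bytes = vec * static_cast<int>(sizeof(T));
    if (address % bytes == 0 && chunk_bytes % bytes == 0) return vec;
  }
  return 1;
}
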
CalcGradNormClipBeforeAllReduceScale( const T1 *__restrict__ global_scale, T1 max_global_grad_norm, const T1 *__restrict__ square_grad_norm, T1 *__restrict__ out1, T2 *__restrict__ out2, T1 clip_rescale_grad) { - T1 grad_norm = static_cast(sqrt(*square_grad_norm)) * clip_rescale_grad; + T1 grad_norm = static_cast(sqrtf(*square_grad_norm)) * clip_rescale_grad; T1 scale = global_scale[0] * max_global_grad_norm / (1e-6 + grad_norm); bool found_nan_inf = !isfinite(scale); if (scale >= 1 || found_nan_inf) { @@ -380,19 +359,24 @@ static __global__ void SetNanInfValueCUDAKernelTwoFlag(const bool *in_flag_p_1, ((*in_flag_p_1) || (*in_flag_p_2)) ? __int_as_float(0x7fffffffU) : 0.0f; } -// TODO(zengjinle): Vectorize this function -// NOTE: this method does not update Beta1Pow and Beta2Pow! -template -static __global__ void UpdateLambMoment( +template +static __global__ void UpdateLambMomentAndTrustRatioDivCUDAKernel( const T *__restrict__ param_p, const GradT *__restrict__ grad_p, const T *__restrict__ square_grad_norm_p, - const T *__restrict__ global_scale, const IndexT *__restrict__ indices, - const T *__restrict__ weight_decay_p, const T *__restrict__ beta1pow_p, + const T *__restrict__ global_scale, const T *__restrict__ beta1pow_p, const T *__restrict__ beta2pow_p, T *__restrict__ mom1_p, - T *__restrict__ mom2_p, T *__restrict__ trust_ratio_div_p, T beta1, T beta2, - T epsilon, T max_global_grad_norm, int num, T rescale_grad) { + T *__restrict__ mom2_p, T *__restrict__ trust_ratio_div_p, bool *found_inf, + T weight_decay, int weight_decay_end_numel, T beta1, T beta2, T epsilon, + T max_global_grad_norm, int num, T rescale_grad) { T square_grad_norm = *square_grad_norm_p; - if (!isfinite(square_grad_norm)) return; + bool need_update_found_inf = + (found_inf && threadIdx.x == 0 && blockIdx.x == 0); + if (!isfinite(square_grad_norm)) { + if (need_update_found_inf) *found_inf = true; + return; + } else if (need_update_found_inf) { + *found_inf = false; + } T scale = rescale_grad / global_scale[0]; if (max_global_grad_norm > 0) { @@ -406,27 +390,112 @@ static __global__ void UpdateLambMoment( T one_minus_beta1pow = 1 - beta1pow_p[0]; T one_minus_beta2pow = 1 - beta2pow_p[0]; - CUDA_KERNEL_LOOP(i, num) { - T p = param_p[i]; - T g = static_cast(grad_p[i]) * scale; - T weight_decay = weight_decay_p[i]; - T mom1 = mom1_p[i]; - T mom2 = mom2_p[i]; - - mom1 = beta1 * mom1 + (1 - beta1) * g; - mom2 = beta2 * mom2 + (1 - beta2) * g * g; - - T mom1_unbiased = mom1 / one_minus_beta1pow; - T mom2_unbiased = mom2 / one_minus_beta2pow; - T trust_ratio_div = - mom1_unbiased / (sqrtf(mom2_unbiased) + epsilon) + weight_decay * p; - - mom1_p[i] = mom1; - mom2_p[i] = mom2; - trust_ratio_div_p[i] = trust_ratio_div; + int i = (threadIdx.x + blockIdx.x * blockDim.x) * VecSize; + int stride = blockDim.x * gridDim.x * VecSize; + + for (; i + VecSize <= num; i += stride) { + platform::AlignedVector param_vec; + platform::AlignedVector grad_vec; + platform::AlignedVector weight_decay_vec; + platform::AlignedVector mom1_vec; + platform::AlignedVector mom2_vec; + platform::AlignedVector trust_ratio_div_vec; + + T cur_weight_decay = (i < weight_decay_end_numel) * weight_decay; + if (cur_weight_decay != static_cast(0.0)) { + platform::Load(param_p + i, ¶m_vec); + } else { +#pragma unroll + for (int j = 0; j < VecSize; ++j) { + param_vec[j] = static_cast(0); + } + } + platform::Load(grad_p + i, &grad_vec); + platform::Load(mom1_p + i, &mom1_vec); + platform::Load(mom2_p + i, &mom2_vec); + +#define 
PD_LAMB_MOM_TRUST_RATIO_DIV_UPDATE(__param, __grad, __mom1, __mom2, \ + __trust_ratio_div, __idx) \ + T p = __param[__idx]; \ + T g = static_cast(__grad[__idx]) * scale; \ + T mom1 = __mom1[__idx]; \ + T mom2 = __mom2[__idx]; \ + mom1 = beta1 * mom1 + (1 - beta1) * g; \ + mom2 = beta2 * mom2 + (1 - beta2) * g * g; \ + T mom1_unbiased = mom1 / one_minus_beta1pow; \ + T mom2_unbiased = mom2 / one_minus_beta2pow; \ + __trust_ratio_div[__idx] = \ + mom1_unbiased / (sqrtf(mom2_unbiased) + epsilon) + cur_weight_decay * p; \ + __mom1[__idx] = mom1; \ + __mom2[__idx] = mom2; + +#pragma unroll + for (int j = 0; j < VecSize; ++j) { + PD_LAMB_MOM_TRUST_RATIO_DIV_UPDATE(param_vec, grad_vec, mom1_vec, + mom2_vec, trust_ratio_div_vec, j); + } + + platform::Store(mom1_vec, mom1_p + i); + platform::Store(mom2_vec, mom2_p + i); + platform::Store(trust_ratio_div_vec, trust_ratio_div_p + i); + } + + for (; i < num; ++i) { + T cur_weight_decay = (i < weight_decay_end_numel) * weight_decay; + PD_LAMB_MOM_TRUST_RATIO_DIV_UPDATE(param_p, grad_p, mom1_p, mom2_p, + trust_ratio_div_p, i); } } +template +static void MultiTensorUpdateLambMomentAndTrustRatioDiv( + const platform::CUDADeviceContext &dev_ctx, const int *offsets, int n, + const T *param_p, const GradT *grad_p, const T *square_grad_norm_p, + const T *global_scale, const T *beta1pow_p, const T *beta2pow_p, T *mom1_p, + T *mom2_p, T *trust_ratio_div_p, bool *found_inf_p, T weight_decay, + int weight_decay_end_idx, T beta1, T beta2, T epsilon, + T max_global_grad_norm, T rescale_grad) { + if (n <= 0) return; + int numel = offsets[n] - offsets[0]; + PADDLE_ENFORCE_GE(weight_decay_end_idx, 0, + platform::errors::InvalidArgument( + "The weight decay end index should be >= 0.")); + PADDLE_ENFORCE_LE(weight_decay_end_idx, n, + platform::errors::InvalidArgument( + "The weight decay end index should be < %d.", n)); + auto weight_decay_end_numel = offsets[weight_decay_end_idx] - offsets[0]; + + int vec_size = GetChunkedVecSize(param_p, 0); + vec_size = std::min(vec_size, GetChunkedVecSize(grad_p, 0)); + vec_size = std::min(vec_size, GetChunkedVecSize(mom1_p, 0)); + vec_size = std::min(vec_size, GetChunkedVecSize(mom2_p, 0)); + vec_size = std::min(vec_size, GetChunkedVecSize(trust_ratio_div_p, 0)); + for (int i = 0; i < n; ++i) { + auto length = offsets[i + 1] - offsets[i]; + while (length % vec_size != 0) { + vec_size /= 2; + } + } + + VLOG(1) << __func__ << " VecSize = " << vec_size; + + auto stream = dev_ctx.stream(); + auto config = platform::GetGpuLaunchConfig1D(dev_ctx, numel, vec_size); + +#define PD_LAUNCH_LAMB_MOM_TRUST_RATIO_DIV_KERNEL \ + do { \ + UpdateLambMomentAndTrustRatioDivCUDAKernel<<< \ + config.block_per_grid, config.thread_per_block, 0, stream>>>( \ + param_p, grad_p, square_grad_norm_p, global_scale, beta1pow_p, \ + beta2pow_p, mom1_p, mom2_p, trust_ratio_div_p, found_inf_p, \ + weight_decay, weight_decay_end_numel, beta1, beta2, epsilon, \ + max_global_grad_norm, numel, rescale_grad); \ + } while (0) + + PD_VEC_LAUNCH_KERNEL(vec_size, PD_LAUNCH_LAMB_MOM_TRUST_RATIO_DIV_KERNEL); +#undef PD_LAUNCH_LAMB_MOM_TRUST_RATIO_DIV_KERNEL +} + template struct LambBetaPowUpdateOnceHelper { LambBetaPowUpdateOnceHelper(T *beta1pow, T *beta2pow, T beta1, T beta2) { @@ -468,33 +537,6 @@ struct LambBetaPowUpdateOnceHelper { HOSTDEVICE void UpdateBetaPows() const {} }; -template -struct LambFoundInfHelper { - public: - explicit LambFoundInfHelper(bool *found_inf) : found_inf_(found_inf) { - PADDLE_ENFORCE_NOT_NULL(found_inf, - platform::errors::InvalidArgument( - 
"The found_inf should not be nullptr.")); - } - - HOSTDEVICE void UpdateFoundInf(bool value) { *found_inf_ = value; } - - private: - bool *__restrict__ found_inf_; -}; - -template <> -struct LambFoundInfHelper { - public: - explicit LambFoundInfHelper(bool *found_inf) { - PADDLE_ENFORCE_EQ( - found_inf, nullptr, - platform::errors::InvalidArgument("The found_inf should be nullptr.")); - } - - HOSTDEVICE void UpdateFoundInf(bool) {} -}; - template struct LambParamHelper { LambParamHelper(T *param, MasterT *master_param) { @@ -509,12 +551,9 @@ struct LambParamHelper { master_param_ = master_param; } - HOSTDEVICE void SetParam(int i, MasterT updated_p) { - param_[i] = static_cast(updated_p); - master_param_[i] = updated_p; - } + HOSTDEVICE T *__restrict__ ParamPtr() { return param_; } - HOSTDEVICE MasterT GetParam(int i) { return master_param_[i]; } + HOSTDEVICE MasterT *__restrict__ MasterParamPtr() { return master_param_; } private: T *__restrict__ param_; @@ -538,158 +577,169 @@ struct LambParamHelper { param_ = param; } - HOSTDEVICE void SetParam(int i, MasterT updated_p) { - param_[i] = static_cast(updated_p); - } + HOSTDEVICE T *__restrict__ ParamPtr() { return param_; } - HOSTDEVICE MasterT GetParam(int i) { - return static_cast>(param_[i]); - } + HOSTDEVICE constexpr MasterT *MasterParamPtr() { return nullptr; } private: T *__restrict__ param_; }; -template -struct LambParamAndBetaPowsUpdateHelper - : public LambParamHelper, - public LambBetaPowUpdateOnceHelper, NeedUpdateBetaPow>, - public LambFoundInfHelper { - LambParamAndBetaPowsUpdateHelper( - ParamT *param, MasterT *master_param, MasterT *beta1pow, - MasterT *beta2pow, MasterT beta1, MasterT beta2, - bool *found_inf, const MasterT *trust_ratio_div, - const MasterT *lr, const IndexT *index, +template +struct LambUpdateParamAndBetaPowsFunctor { + DEVICE void operator()( + int tensor_id, int chunk_id, int offset, int size, + LambParamHelper param_helper, + const MasterT *trust_ratio_div, const MasterT *lr, const MasterT *param_square_norm, - const MasterT *trust_ratio_div_square_norm, - const MasterT *update_flag) - : LambParamHelper(param, master_param), - LambBetaPowUpdateOnceHelper, NeedUpdateBetaPow>( - beta1pow, beta2pow, beta1, beta2), - LambFoundInfHelper(found_inf), - trust_ratio_div(trust_ratio_div), - lr(lr), - index(index), - param_square_norm(param_square_norm), - trust_ratio_div_square_norm(trust_ratio_div_square_norm), - update_flag(update_flag) {} - - const MasterT *__restrict__ trust_ratio_div; - const MasterT *__restrict__ lr; - const IndexT *__restrict__ index; - const MasterT *__restrict__ param_square_norm; - const MasterT *__restrict__ trust_ratio_div_square_norm; - const MasterT *__restrict__ update_flag; -}; + const MasterT *trust_ratio_div_square_norm, const bool *found_inf, + LambBetaPowUpdateOnceHelper, NeedUpdateBetaPow> + betapow_helper) const { + if (*found_inf) return; + + using MT = MasterT; -template -static __global__ void LambUpdateParamAndBetaPowsCUDAKernel( - LambParamAndBetaPowsUpdateHelper - args, - int num) { - auto should_update = *args.update_flag; - if (!isfinite(should_update)) { - if (HasFoundInf && threadIdx.x == 0 && blockIdx.x == 0) { - args.UpdateFoundInf(true); + MT p_square_norm = param_square_norm[tensor_id]; + MT t_square_norm = trust_ratio_div_square_norm[tensor_id]; + MT lr_value = *lr; + MT ratio = (p_square_norm != static_cast(0) && + t_square_norm != static_cast(0) + ? 
lr_value * sqrtf(p_square_norm / t_square_norm) + : lr_value); + + int i; + int stride = blockDim.x * VecSize; + + ParamT *param = param_helper.ParamPtr() + offset; + MT *master_param = HasMasterParam ? param_helper.MasterParamPtr() + offset + : param_helper.MasterParamPtr(); + trust_ratio_div += offset; + + for (i = threadIdx.x * VecSize; i + VecSize <= size; i += stride) { + platform::AlignedVector trust_ratio_div_vec; + platform::Load(trust_ratio_div + i, &trust_ratio_div_vec); + if (HasMasterParam) { + platform::AlignedVector master_param_vec; + platform::Load(master_param + i, &master_param_vec); + platform::AlignedVector param_vec; +#pragma unroll + for (int j = 0; j < VecSize; ++j) { + MT p = master_param_vec[j] - ratio * trust_ratio_div_vec[j]; + master_param_vec[j] = p; + param_vec[j] = static_cast(p); + } + platform::Store(master_param_vec, master_param + i); + platform::Store(param_vec, param + i); + } else { + platform::AlignedVector param_vec; + platform::Load(param + i, ¶m_vec); +#pragma unroll + for (int j = 0; j < VecSize; ++j) { + MT p = static_cast(param_vec[j]) - ratio * trust_ratio_div_vec[j]; + param_vec[j] = static_cast(p); + } + platform::Store(param_vec, param + i); + } + } + + for (; i < size; ++i) { + if (HasMasterParam) { + MT p = master_param[i] - ratio * trust_ratio_div[i]; + master_param[i] = p; + param[i] = static_cast(p); + } else { + MT p = static_cast(param[i]) - ratio * trust_ratio_div[i]; + param[i] = static_cast(p); + } + } + + if (NeedUpdateBetaPow && threadIdx.x == 0 && blockIdx.x == 0) { + betapow_helper.UpdateBetaPows(); } - return; - } else if (HasFoundInf && threadIdx.x == 0 && blockIdx.x == 0) { - args.UpdateFoundInf(false); } +}; - if (NeedUpdateBetaPow && threadIdx.x == 0 && blockIdx.x == 0) { - args.UpdateBetaPows(); +// TODO(zengjinle): which block_dim and chunk_size would be better? 
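// ----------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): the per-tensor math that the
// vectorized LambUpdateParamAndBetaPowsFunctor above implements. The helper
// name LambUpdateReference is hypothetical and only spells out the rule:
// ratio = lr * sqrt(||param||^2 / ||trust_ratio_div||^2) when both norms are
// non-zero (otherwise plain lr), then param[i] -= ratio * trust_ratio_div[i].
// ----------------------------------------------------------------------------
#include <cmath>
#include <cstddef>

template <typename T>
void LambUpdateReference(T *param, const T *trust_ratio_div, std::size_t n,
                         T lr, T param_square_norm,
                         T trust_ratio_div_square_norm) {
  // Per-tensor trust ratio, guarded against zero norms as in the functor.
  T ratio =
      (param_square_norm != T(0) && trust_ratio_div_square_norm != T(0))
          ? lr * std::sqrt(param_square_norm / trust_ratio_div_square_norm)
          : lr;
  for (std::size_t i = 0; i < n; ++i) {
    param[i] -= ratio * trust_ratio_div[i];
  }
}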
+template +static void MultiTensorUpdateLambParamAndBetaPows( + const platform::CUDADeviceContext &dev_ctx, const int *offsets, int n, + const MasterT *trust_ratio_div, const MasterT *lr, + const MasterT *param_square_norm, + const MasterT *trust_ratio_div_square_norm, const bool *found_inf, + ParamT *param, MasterT *master_param, MasterT *beta1pow, + MasterT *beta2pow, MasterT beta1, MasterT beta2, + int chunk_size = 65536) { + constexpr bool kHasMasterParam = + !(std::is_same>::value); + + bool has_beta_pow = (beta1pow != nullptr); + if (has_beta_pow) { + PADDLE_ENFORCE_NOT_NULL(beta2pow, platform::errors::InvalidArgument( + "Beta2Pow should not be nullptr.")); + } else { + PADDLE_ENFORCE_EQ(beta2pow, nullptr, platform::errors::InvalidArgument( + "Beta2Pow should be nullptr.")); } - using MT = MasterT; + const int block_dim = 512; - MT lr_value = *args.lr; - CUDA_KERNEL_LOOP(i, num) { - MT p = args.GetParam(i); - MT t = args.trust_ratio_div[i]; - auto norm_idx = args.index[i]; - MT p_square_norm = args.param_square_norm[norm_idx]; - MT t_square_norm = args.trust_ratio_div_square_norm[norm_idx]; + int vec_size = 8; + for (int i = 0; i < n; ++i) { + int offset = offsets[i] - offsets[0]; + vec_size = + std::min(vec_size, GetChunkedVecSize(param + offset, chunk_size)); + if (kHasMasterParam) { + vec_size = std::min(vec_size, + GetChunkedVecSize(master_param + offset, chunk_size)); + } + vec_size = std::min( + vec_size, GetChunkedVecSize(trust_ratio_div + offset, chunk_size)); + } - MT p_norm = static_cast(sqrtf(p_square_norm)); - MT t_norm = static_cast(sqrtf(t_square_norm)); + VLOG(1) << __func__ << " VecSize = " << vec_size; - auto update = (p_norm != static_cast(0) && t_norm != static_cast(0)) - ? p_norm / t_norm - : static_cast(1); + constexpr auto kNumTensor = MaxTensorNumPerLaunch; + constexpr auto kNumChunk = MaxChunkNumPerLaunch; - MT updated_p = p - lr_value * update * t; - args.SetParam(i, updated_p); - } -} + auto stream = dev_ctx.stream(); +#define PD_LAUNCH_MULTI_TENSOR_UPDATE_PARAM_BETAPOW(__has_beta_pow) \ + do { \ + using FunctorT = \ + LambUpdateParamAndBetaPowsFunctor; \ + LambParamHelper param_helper(param, \ + master_param); \ + LambBetaPowUpdateOnceHelper, __has_beta_pow> \ + betapow_helper(beta1pow, beta2pow, beta1, beta2); \ + launcher.Launch(FunctorT(), param_helper, trust_ratio_div, lr, \ + param_square_norm, trust_ratio_div_square_norm, found_inf, \ + betapow_helper); \ + } while (0) -template -static void LambUpdateParamAndBetaPows( - const platform::CUDADeviceContext &dev_ctx, - const MasterT *trust_ratio_div, const MasterT *lr, - const IndexT *index, const MasterT *param_square_norm, - const MasterT *trust_ratio_div_square_norm, - const MasterT *update_flag, MasterT **beta1pow, - MasterT **beta2pow, bool **found_inf, MasterT beta1, - MasterT beta2, int num, ParamT *param, - MasterT *master_param, gpuStream_t stream) { - if (num == 0) return; - - bool has_master_param = !(std::is_same>::value); - auto has_beta_pow = (*beta1pow) != nullptr && (*beta2pow) != nullptr; - auto has_found_inf = (*found_inf) != nullptr; - -#define PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL( \ - __has_master_param, __has_beta_pow, __has_found_inf) \ - do { \ - LambParamAndBetaPowsUpdateHelper \ - helper(param, master_param, *beta1pow, *beta2pow, beta1, beta2, \ - *found_inf, trust_ratio_div, lr, index, param_square_norm, \ - trust_ratio_div_square_norm, update_flag); \ - auto config = platform::GetGpuLaunchConfig1D(dev_ctx, num); \ - LambUpdateParamAndBetaPowsCUDAKernel<<< \ - 
config.block_per_grid, config.thread_per_block, 0, stream>>>(helper, \ - num); \ +#define PD_LAUNCH_VEC_MULTI_TENSOR_UPDATE_PARAM_BETAPOW_CASE \ + do { \ + auto callback = [&]( \ + const MultiTensorLauncher &launcher, \ + int launch_n) { \ + if (has_beta_pow && launch_n == 0) { \ + PD_LAUNCH_MULTI_TENSOR_UPDATE_PARAM_BETAPOW(true); \ + beta1pow = nullptr; \ + beta2pow = nullptr; \ + } else { \ + PD_LAUNCH_MULTI_TENSOR_UPDATE_PARAM_BETAPOW(false); \ + } \ + }; \ + MultiTensorApplyWithCallback( \ + stream, offsets, n, chunk_size, block_dim, callback); \ } while (0) - if (has_master_param) { - if (has_beta_pow) { - if (has_found_inf) { - PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL(true, true, true); - } else { - PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL(true, true, false); - } - } else { - if (has_found_inf) { - PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL(true, false, true); - } else { - PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL(true, false, false); - } - } - } else { - if (has_beta_pow) { - if (has_found_inf) { - PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL(false, true, true); - } else { - PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL(false, true, false); - } - } else { - if (has_found_inf) { - PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL(false, false, true); - } else { - PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL(false, false, false); - } - } - } + PD_VEC_LAUNCH_KERNEL(vec_size, + PD_LAUNCH_VEC_MULTI_TENSOR_UPDATE_PARAM_BETAPOW_CASE); - *beta1pow = nullptr; - *beta2pow = nullptr; - *found_inf = nullptr; -#undef PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL +#undef PD_LAUNCH_MULTI_TENSOR_UPDATE_PARAM_BETAPOW +#undef PD_LAUNCH_VEC_MULTI_TENSOR_UPDATE_PARAM_BETAPOW_CASE } #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) @@ -1005,15 +1055,16 @@ class DistributedFusedLambOpKernel "Too many parameter number. 
Only <= %d is supported.", std::numeric_limits::max())); - // Step 3: Get FusedIndices, ParamInfo - const auto *indices = GetInputTensorPtr(ctx, "FusedIndices"); + // Step 3: Get ParamInfo const auto *param_info_tensor = GetInputTensorPtr(ctx, "ParamInfo"); auto fp32_local_start_idx = param_info_tensor[0]; auto fp32_local_param_num = param_info_tensor[1]; auto fp32_global_param_num = param_info_tensor[2]; - auto fp16_local_start_idx = param_info_tensor[3]; - auto fp16_local_param_num = param_info_tensor[4]; - auto fp16_global_param_num = param_info_tensor[5]; + auto fp32_weight_decay_end_idx = param_info_tensor[3]; + auto fp16_local_start_idx = param_info_tensor[4]; + auto fp16_local_param_num = param_info_tensor[5]; + auto fp16_global_param_num = param_info_tensor[6]; + auto fp16_weight_decay_end_idx = param_info_tensor[7]; auto local_param_num = fp32_local_param_num + fp16_local_param_num; auto param_num = fp32_global_param_num + fp16_global_param_num; @@ -1031,7 +1082,7 @@ class DistributedFusedLambOpKernel << " , fp16_global_param_num = " << fp16_global_param_num; // Step 4: Get LearningRate, Moment1, Moment2, Beta1Pow, Beta2Pow, - // WeightDecay, GlobalScale, FoundInf + // GlobalScale, FoundInf const auto *global_scale = GetInputTensorPtr(ctx, "GlobalScale"); const auto *lr = GetInputTensorPtr(ctx, "LearningRate"); int64_t partial_numel = 0; @@ -1065,14 +1116,15 @@ class DistributedFusedLambOpKernel GetSameInOutTensorPtr(ctx, place, "Beta1Pow", "Beta1PowOut"); auto *beta2pow = GetSameInOutTensorPtr(ctx, place, "Beta2Pow", "Beta2PowOut"); - const float *weight_decay = GetInputTensorPtr(ctx, "WeightDecay"); auto *found_inf_t = ctx.Output("FoundInf"); found_inf_t->Resize({1}); auto *found_inf = found_inf_t->mutable_data(place); - // Step 5: Get attributes beta1, beta2, epsilon, max_grad_norm, ring_id, + // Step 5: Get attributes weight_decay, beta1, beta2, epsilon, + // max_grad_norm, ring_id, // use_master_param_norm, is_grad_scaled_by_nranks + auto weight_decay = ctx.Attr("weight_decay"); auto beta1 = ctx.Attr("beta1"); auto beta2 = ctx.Attr("beta2"); auto epsilon = ctx.Attr("epsilon"); @@ -1105,7 +1157,8 @@ class DistributedFusedLambOpKernel platform::float16 *fp16_sum_grad; auto fp32_numel_each_device = fp32_numel / num_devices; auto fp16_numel_each_device = fp16_numel / num_devices; - if (num_devices > 1) { + if (num_devices > 1 || + (max_global_grad_norm > 0 && !clip_after_allreduce)) { auto ptr = sum_grad_buffer.Alloc( fp32_numel_each_device * sizeof(float) + fp16_numel_each_device * sizeof(platform::float16)); @@ -1181,7 +1234,11 @@ class DistributedFusedLambOpKernel float, platform::float16><<<1, 1, 0, stream>>>( global_scale, max_global_grad_norm, fp32_square_grad_norm, fp32_scale, fp16_scale, clip_scale); - VLOG(1) << "Grad scale: " << FlattenToString(fp32_scale, 1, place); + if (fp32_scale) { + VLOG(1) << "Grad scale: " << FlattenToString(fp32_scale, 1, place); + } else { + VLOG(1) << "Grad scale: " << FlattenToString(fp16_scale, 1, place); + } if (num_devices > 1) { PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( fp32_square_grad_norm, fp32_square_grad_norm, 1, ncclFloat32, @@ -1218,36 +1275,56 @@ class DistributedFusedLambOpKernel VLOG(10) << "ReduceScatter done"; // Step 7: update the moment1, moment2. 
Calcuate the trust_ratio_div + auto *fused_offsets_t = ctx.Input("FusedParamOffsets"); + auto *fused_offsets = fused_offsets_t->data(); + auto *fp32_partial_fused_offsets_t = + ctx.Input("FP32ShardFusedParamOffsets"); + const auto *fp32_partial_fused_offsets = + fp32_partial_fused_offsets_t->data(); + auto *fp16_partial_fused_offsets_t = + ctx.Input("FP16ShardFusedParamOffsets"); + const auto *fp16_partial_fused_offsets = + fp16_partial_fused_offsets_t->data(); + + VLOG(1) << "FusedParamOffsets: " + << FlattenToString(fused_offsets, fused_offsets_t->numel(), + fused_offsets_t->place()); + VLOG(1) << "FP32ShardFusedParamOffsets: " + << FlattenToString(fp32_partial_fused_offsets, + fp32_partial_fused_offsets_t->numel(), + fp32_partial_fused_offsets_t->place()); + VLOG(1) << "FP16ShardFusedParamOffsets: " + << FlattenToString(fp16_partial_fused_offsets, + fp16_partial_fused_offsets_t->numel(), + fp16_partial_fused_offsets_t->place()); + memory::Buffer trust_ratio_div_buffer(place); auto *trust_ratio_div = trust_ratio_div_buffer.Alloc(partial_numel); auto fp32_offset = rank * fp32_numel_each_device; auto fp16_offset = rank * fp16_numel_each_device; if (has_fp32_param) { - auto config = - platform::GetGpuLaunchConfig1D(dev_ctx, fp32_numel_each_device); VLOG(10) << "Update FP32 Moment and TrustRatioDiv starts"; - UpdateLambMoment<<>>( + MultiTensorUpdateLambMomentAndTrustRatioDiv( + dev_ctx, fp32_partial_fused_offsets, fp32_local_param_num, fp32_param + fp32_offset, fp32_sum_grad, fp32_square_grad_norm, - global_scale, indices + fp32_offset, weight_decay, beta1pow, beta2pow, - moment1, moment2, trust_ratio_div, beta1, beta2, epsilon, - max_global_grad_norm, fp32_numel_each_device, rescale_grad); + global_scale, beta1pow, beta2pow, moment1, moment2, trust_ratio_div, + found_inf, weight_decay, fp32_weight_decay_end_idx, beta1, beta2, + epsilon, max_global_grad_norm, rescale_grad); VLOG(10) << "Update FP32 Moment and TrustRatioDiv done"; } float *master_param = nullptr; if (has_fp16_param) { master_param = fp32_param + fp32_numel; - auto config = - platform::GetGpuLaunchConfig1D(dev_ctx, fp16_numel_each_device); VLOG(10) << "Update FP16 Moment and TrustRatioDiv starts"; - UpdateLambMoment<<>>( + auto tmp_found_inf = has_fp32_param ? 
nullptr : found_inf; + MultiTensorUpdateLambMomentAndTrustRatioDiv( + dev_ctx, fp16_partial_fused_offsets, fp16_local_param_num, master_param + fp16_offset, fp16_sum_grad, fp32_square_grad_norm, - global_scale, indices + fp32_numel + fp16_offset, weight_decay, - beta1pow, beta2pow, moment1 + fp32_numel_each_device, + global_scale, beta1pow, beta2pow, moment1 + fp32_numel_each_device, moment2 + fp32_numel_each_device, - trust_ratio_div + fp32_numel_each_device, beta1, beta2, epsilon, - max_global_grad_norm, fp16_numel_each_device, rescale_grad); + trust_ratio_div + fp32_numel_each_device, tmp_found_inf, weight_decay, + fp16_weight_decay_end_idx, beta1, beta2, epsilon, + max_global_grad_norm, rescale_grad); VLOG(10) << "Update FP16 Moment and TrustRatioDiv done"; } @@ -1257,30 +1334,6 @@ class DistributedFusedLambOpKernel memory::Buffer square_norm_buffer(place); auto *param_square_norm = square_norm_buffer.Alloc(2 * param_num); auto *trust_ratio_div_square_norm = param_square_norm + param_num; - - auto *fused_offsets_t = ctx.Input("FusedParamOffsets"); - auto *fused_offsets = fused_offsets_t->data(); - auto *fp32_partial_fused_offsets_t = - ctx.Input("FP32ShardFusedParamOffsets"); - const auto *fp32_partial_fused_offsets = - fp32_partial_fused_offsets_t->data(); - auto *fp16_partial_fused_offsets_t = - ctx.Input("FP16ShardFusedParamOffsets"); - const auto *fp16_partial_fused_offsets = - fp16_partial_fused_offsets_t->data(); - - VLOG(1) << "FusedParamOffsets: " - << FlattenToString(fused_offsets, fused_offsets_t->numel(), - fused_offsets_t->place()); - VLOG(1) << "FP32ShardFusedParamOffsets: " - << FlattenToString(fp32_partial_fused_offsets, - fp32_partial_fused_offsets_t->numel(), - fp32_partial_fused_offsets_t->place()); - VLOG(1) << "FP16ShardFusedParamOffsets: " - << FlattenToString(fp16_partial_fused_offsets, - fp16_partial_fused_offsets_t->numel(), - fp16_partial_fused_offsets_t->place()); - if (num_devices > 1) { if (use_master_param_norm) { FillZeroWithPtr(param_square_norm + fp32_global_param_num, @@ -1296,11 +1349,11 @@ class DistributedFusedLambOpKernel fp16_partial_fused_offsets, fp16_local_param_num, param_square_norm + fp16_local_start_idx); } else { - // NOTE: extra computation is performed. We can improve this performance - // if needed in the future. MultiTensorL2Norm( - place, stream, fp16_param, fused_offsets + fp32_global_param_num, - fp16_global_param_num, param_square_norm + fp32_global_param_num); + place, stream, fp16_param + fused_offsets[fp16_local_start_idx] - + fused_offsets[fp32_global_param_num], + fused_offsets + fp16_local_start_idx, fp16_local_param_num, + param_square_norm + fp16_local_start_idx); } MultiTensorL2Norm(place, stream, trust_ratio_div, @@ -1333,26 +1386,29 @@ class DistributedFusedLambOpKernel // Step 9: update parameter, beta1pow, beta2pow. All gather parameters. 
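// ----------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): how a single vector width is
// chosen for a fused buffer described by an offsets array, mirroring the
// "while (length % vec_size != 0) vec_size /= 2;" loops above. The helper
// name PickCommonVecSize and the float element type are assumptions; the real
// code additionally folds in GetChunkedVecSize for every pointer involved.
// ----------------------------------------------------------------------------
#include <cstdint>

int PickCommonVecSize(const float *base_ptr, const int *offsets, int n) {
  int vec_size = 8;  // widest candidate width
  // Shrink until the base address is aligned for vectorized loads/stores.
  auto addr = reinterpret_cast<std::uintptr_t>(base_ptr);
  while (vec_size > 1 && addr % (vec_size * sizeof(float)) != 0) {
    vec_size /= 2;
  }
  // Shrink further until it divides every tensor length in the fused buffer,
  // so no per-tensor slice needs a scalar tail in the vectorized kernel.
  for (int i = 0; i < n; ++i) {
    const int length = offsets[i + 1] - offsets[i];
    while (vec_size > 1 && length % vec_size != 0) {
      vec_size /= 2;
    }
  }
  return vec_size;
}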
if (has_fp32_param) { - LambUpdateParamAndBetaPows( - dev_ctx, trust_ratio_div, lr, indices + fp32_offset, - param_square_norm, trust_ratio_div_square_norm, fp32_square_grad_norm, - &beta1pow, &beta2pow, &found_inf, beta1, beta2, - fp32_numel_each_device, fp32_param + fp32_offset, nullptr, stream); + MultiTensorUpdateLambParamAndBetaPows( + dev_ctx, fp32_partial_fused_offsets, fp32_local_param_num, + trust_ratio_div, lr, param_square_norm + fp32_local_start_idx, + trust_ratio_div_square_norm + fp32_local_start_idx, found_inf, + fp32_param + fp32_offset, nullptr, beta1pow, beta2pow, beta1, beta2); if (num_devices > 1) { // ncclAllGather PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( fp32_param + fp32_offset, fp32_param, fp32_numel_each_device, ncclFloat32, comm, stream)); } + + beta1pow = nullptr; + beta2pow = nullptr; } if (has_fp16_param) { - LambUpdateParamAndBetaPows( - dev_ctx, trust_ratio_div + fp32_numel_each_device, lr, - indices + fp32_numel + fp16_offset, param_square_norm, - trust_ratio_div_square_norm, fp32_square_grad_norm, &beta1pow, - &beta2pow, &found_inf, beta1, beta2, fp16_numel_each_device, - fp16_param + fp16_offset, master_param + fp16_offset, stream); - + MultiTensorUpdateLambParamAndBetaPows( + dev_ctx, fp16_partial_fused_offsets, fp16_local_param_num, + trust_ratio_div + fp32_numel_each_device, lr, + param_square_norm + fp16_local_start_idx, + trust_ratio_div_square_norm + fp16_local_start_idx, found_inf, + fp16_param + fp16_offset, master_param + fp16_offset, beta1pow, + beta2pow, beta1, beta2); if (num_devices > 1) { // ncclAllGather PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( diff --git a/paddle/fluid/operators/optimizers/multi_tensor_apply.h b/paddle/fluid/operators/optimizers/multi_tensor_apply.h index 5d8d03c733dae210e8a41a8ad78a258df558b341..179e8f452545c437e373e42d59d18f524f260cd5 100644 --- a/paddle/fluid/operators/optimizers/multi_tensor_apply.h +++ b/paddle/fluid/operators/optimizers/multi_tensor_apply.h @@ -94,11 +94,40 @@ static __global__ void MultiTensorApplyCUDAKernel( args...); } -template -static void MultiTensorApply(Functor functor, gpuStream_t stream, - const int *offsets, int n, int chunk_size, - Args... args) { +template +class MultiTensorLauncher { + public: + MultiTensorLauncher( + const TensorMetaList &meta, + const int &chunk_id, const int &chunk_size, const int &block_dim, + const gpuStream_t &stream) + : meta_(meta), + chunk_id_(chunk_id), + chunk_size_(chunk_size), + block_dim_(block_dim), + stream_(stream) {} + + template + void Launch(Functor &&functor, Args &&... 
args) const { + MultiTensorApplyCUDAKernel< + Functor, MaxTensorNumPerLaunch, + MaxChunkNumPerLaunch><<>>( + functor, meta_, chunk_size_, args...); + } + + private: + const TensorMetaList &meta_; + const int &chunk_id_; + const int &chunk_size_; + const int &block_dim_; + const gpuStream_t &stream_; +}; + +template +static void MultiTensorApplyWithCallback(gpuStream_t stream, const int *offsets, + int n, int chunk_size, int block_dim, + Callback &&callback) { if (n == 0) return; constexpr auto NumTensor = MaxTensorNumPerLaunch; @@ -110,6 +139,11 @@ static void MultiTensorApply(Functor functor, gpuStream_t stream, int numel_offset = 0; metas.start_tensor_id = 0; metas.start_chunk_id = 0; + int launch_num = 0; + + MultiTensorLauncher launcher( + metas, chunk_id, chunk_size, block_dim, stream); + for (int i = 0; i < n; ++i) { auto length = offsets[i + 1] - offsets[i]; if (tensor_id == 0) { @@ -132,9 +166,8 @@ static void MultiTensorApply(Functor functor, gpuStream_t stream, bool last_chunk = (i + 1 == n && j + 1 == chunk_num); if (tensor_full || block_full || last_chunk) { - MultiTensorApplyCUDAKernel<<>>( - functor, metas, chunk_size, args...); + callback(launcher, launch_num); + ++launch_num; chunk_id = 0; if (j + 1 == chunk_num) { // chunk for the current tensor is full metas.start_chunk_id = 0; @@ -152,5 +185,17 @@ static void MultiTensorApply(Functor functor, gpuStream_t stream, } } +template +static void MultiTensorApply(Functor functor, gpuStream_t stream, + const int *offsets, int n, int chunk_size, + int block_dim, Args &&... args) { + auto callback = [&](const MultiTensorLauncher &launcher, + int i) { launcher.Launch(functor, args...); }; + MultiTensorApplyWithCallback( + stream, offsets, n, chunk_size, block_dim, callback); +} + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/p_norm_op.cu b/paddle/fluid/operators/p_norm_op.cu index f2cb427a0a5b139e1ccdf960afeb6db4bcb8b5a5..d0b78b9b0643d6c5dc5b4bfeac2cf792ac349194 100644 --- a/paddle/fluid/operators/p_norm_op.cu +++ b/paddle/fluid/operators/p_norm_op.cu @@ -39,6 +39,11 @@ __device__ __forceinline__ int sgn(T val) { __device__ __forceinline__ platform::float16 inline_abs(platform::float16 x) { return static_cast(abs(static_cast(x))); } + +__device__ __forceinline__ platform::bfloat16 inline_abs(platform::bfloat16 x) { + return static_cast(abs(static_cast(x))); +} + __device__ __forceinline__ float inline_abs(float x) { return abs(x); } __device__ __forceinline__ double inline_abs(double x) { return abs(x); } @@ -53,6 +58,11 @@ __device__ __forceinline__ platform::float16 inline_pow( return static_cast( pow(static_cast(base), static_cast(exponent))); } +__device__ __forceinline__ platform::bfloat16 inline_pow( + platform::bfloat16 base, platform::bfloat16 exponent) { + return static_cast( + pow(static_cast(base), static_cast(exponent))); +} __device__ __forceinline__ float inline_pow(float base, float exponent) { return pow(base, exponent); } @@ -202,9 +212,11 @@ using CUDA = paddle::platform::CUDADeviceContext; REGISTER_OP_CUDA_KERNEL(p_norm, ops::PnormCUDAKernel, + ops::PnormCUDAKernel, ops::PnormCUDAKernel, ops::PnormCUDAKernel); REGISTER_OP_CUDA_KERNEL( p_norm_grad, ops::PnormGradCUDAKernel, + ops::PnormGradCUDAKernel, ops::PnormGradCUDAKernel, ops::PnormGradCUDAKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu index c3d3e0cf6ecd51f3bb2baa063878f80444db3563..2f6bf127518090916c4b947daf1d1f202fdd5960 100644 --- 
a/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu @@ -23,6 +23,7 @@ REGISTER_OP_CUDA_KERNEL( reduce_sum_grad, CUDAReduceSumGradKernel, CUDAReduceSumGradKernel, CUDAReduceSumGradKernel, CUDAReduceSumGradKernel, + CUDAReduceSumGradKernel, CUDAReduceSumGradKernel, CUDAReduceSumGradKernel, CUDAReduceSumGradKernel>, CUDAReduceSumGradKernel>); diff --git a/paddle/fluid/operators/split_op.cc b/paddle/fluid/operators/split_op.cc index a8f05d94563e57a20cc41ba1edd68872d869d00e..6678320f9ffa61e3e6c51fd806569c2571d63d69 100644 --- a/paddle/fluid/operators/split_op.cc +++ b/paddle/fluid/operators/split_op.cc @@ -15,6 +15,9 @@ limitations under the License. */ #include "paddle/fluid/operators/split_op.h" #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/infermeta/unary.h" + namespace paddle { namespace operators { using framework::Tensor; @@ -23,52 +26,6 @@ class SplitOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(X) of SplitOp should not be null.")); - PADDLE_ENFORCE_GE(ctx->Outputs("Out").size(), 1UL, - platform::errors::InvalidArgument( - "Outputs(Out) of SplitOp should not be empty.")); - auto in_dims = ctx->GetInputDim("X"); - auto outs_names = ctx->Outputs("Out"); - size_t axis = static_cast(ctx->Attrs().Get("axis")); - size_t num = static_cast(ctx->Attrs().Get("num")); - std::vector sections = static_cast>( - ctx->Attrs().Get>("sections")); - const size_t outs_number = outs_names.size(); - - if (sections.size() > 0) { - PADDLE_ENFORCE_EQ( - sections.size(), outs_number, - platform::errors::InvalidArgument("tensor split sections size " - "should be equal to output size.")); - } - - if (ctx->HasInput("AxisTensor")) { - auto out_dims = phi::make_ddim(std::vector(in_dims.size(), -1)); - std::vector outs_dims(outs_number, out_dims); - ctx->SetOutputsDim("Out", outs_dims); - for (size_t i = 0; i < outs_number; ++i) { - ctx->ShareLoD("X", "Out", 0, i); - } - return; - } - - bool each_section_is_known = - (sections.size() > 0 && !ctx->HasInputs("SectionsTensorList")); - - auto outs_dims = UpdateOutsDims(ctx->IsRuntime(), each_section_is_known, - in_dims, num, sections, axis, outs_number); - ctx->SetOutputsDim("Out", outs_dims); - if (axis != 0) { - // Only pass LoD when not spliting along the first dim. 
- for (size_t i = 0; i < outs_number; ++i) { - ctx->ShareLoD("X", "Out", 0, i); - } - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -168,6 +125,10 @@ Example: namespace ops = paddle::operators; +DELCARE_INFER_SHAPE_FUNCTOR(split, SplitInferShapeFunctor, + PT_INFER_META(phi::SplitInferMeta)); + REGISTER_OPERATOR(split, ops::SplitOp, ops::SplitOpMaker, ops::SplitGradMaker, - ops::SplitGradMaker); + ops::SplitGradMaker, + SplitInferShapeFunctor); diff --git a/paddle/fluid/operators/sum_op.cu b/paddle/fluid/operators/sum_op.cu index 3e2d2a5495b3428ce0fad9d61431d53b44eea330..33590c1d7cca04e215e55abb26fb2aa3c3b61bec 100644 --- a/paddle/fluid/operators/sum_op.cu +++ b/paddle/fluid/operators/sum_op.cu @@ -258,4 +258,5 @@ REGISTER_OP_CUDA_KERNEL( ops::SumKernel, ops::SumKernel, ops::SumKernel, - ops::SumKernel); + ops::SumKernel, + ops::SumKernel); diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc index 353d653f48141b2e68db6143c1ca0859a9ecc13f..1c22e60fa87aa73246806e4f5bc70e49a3b0f958 100644 --- a/paddle/fluid/operators/uniform_random_op.cc +++ b/paddle/fluid/operators/uniform_random_op.cc @@ -281,10 +281,6 @@ REGISTER_OPERATOR( paddle::framework::EmptyGradOpMaker, paddle::operators::UniformRandomOpVarTypeInference); -REGISTER_OP_CPU_KERNEL( - uniform_random, paddle::operators::CPUUniformRandomKernel, - paddle::operators::CPUUniformRandomKernel, - paddle::operators::CPUUniformRandomKernel); REGISTER_OP_CPU_KERNEL( uniform_random_batch_size_like, paddle::operators::CPUUniformRandomKernel, diff --git a/paddle/fluid/operators/uniform_random_op.cu b/paddle/fluid/operators/uniform_random_op.cu index fb38a6aded4cf173bb4c0dd96d131ff520b6701e..2ceb8a68d863dfe71458c67deeac7f54df0a662b 100644 --- a/paddle/fluid/operators/uniform_random_op.cu +++ b/paddle/fluid/operators/uniform_random_op.cu @@ -58,9 +58,6 @@ class GPUUniformRandomKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -REGISTER_OP_CUDA_KERNEL(uniform_random, - paddle::operators::GPUUniformRandomKernel, - paddle::operators::GPUUniformRandomKernel); REGISTER_OP_CUDA_KERNEL(uniform_random_batch_size_like, paddle::operators::GPUUniformRandomKernel, paddle::operators::GPUUniformRandomKernel); diff --git a/paddle/fluid/operators/where_op.cc b/paddle/fluid/operators/where_op.cc index 92ed2bbdc33f55315b3dddf8dc106b7716e97a6f..0f10efefa137b698b59db23b67122df990cfa366 100644 --- a/paddle/fluid/operators/where_op.cc +++ b/paddle/fluid/operators/where_op.cc @@ -12,8 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/operators/where_op.h" - +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/multiary.h" namespace paddle { namespace operators { @@ -21,31 +23,6 @@ class WhereOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Condition"), "Input", "Condition", "Where"); - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Where"); - OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "Where"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Where"); - - auto cond_dims = ctx->GetInputDim("Condition"); - auto x_dims = ctx->GetInputDim("X"); - auto y_dims = ctx->GetInputDim("Y"); - PADDLE_ENFORCE_EQ( - cond_dims, x_dims, - platform::errors::InvalidArgument( - "The dims of Inputs(Condition) and Inputs(X) should be same. " - "But received Condition's shape is [%s], X's shape is [%s]", - cond_dims, x_dims)); - PADDLE_ENFORCE_EQ(x_dims, y_dims, - platform::errors::InvalidArgument( - "The dims of Inputs(X) and Inputs(Y) should be same. " - "But received X's shape is [%s], Y's shape is [%s]", - x_dims, y_dims)); - - ctx->SetOutputDim("Out", ctx->GetInputDim("X")); - ctx->ShareLoD("X", /*->*/ "Out"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -140,19 +117,12 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(WhereGradNoNeedBufferVarsInferer, "X", "Y"); } // namespace paddle namespace ops = paddle::operators; +DELCARE_INFER_SHAPE_FUNCTOR(where, WhereInferShapeFunctor, + PT_INFER_META(phi::WhereInferMeta)); REGISTER_OPERATOR(where, ops::WhereOp, ops::WhereOpMaker, ops::WhereOpGradMaker, - ops::WhereOpGradMaker); + ops::WhereOpGradMaker, + WhereInferShapeFunctor); REGISTER_OPERATOR(where_grad, ops::WhereGradOp, ops::WhereGradNoNeedBufferVarsInferer); -REGISTER_OP_CPU_KERNEL( - where, ops::WhereKernel, - ops::WhereKernel, - ops::WhereKernel, - ops::WhereKernel); -REGISTER_OP_CPU_KERNEL( - where_grad, ops::WhereGradKernel, - ops::WhereGradKernel, - ops::WhereGradKernel, - ops::WhereGradKernel); diff --git a/paddle/fluid/operators/where_op.cu b/paddle/fluid/operators/where_op.cu deleted file mode 100644 index 61a1691e4fe265035917ed2407d5e3e24aa6bd88..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/where_op.cu +++ /dev/null @@ -1,126 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" -#include "paddle/fluid/operators/where_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" - -namespace platform = paddle::platform; - -namespace paddle { -namespace operators { - -template -struct CondFunctor { - HOSTDEVICE inline CondFunctor() {} - - HOSTDEVICE inline T operator()(const bool cond, const T x, const T y) const { - return cond ? x : y; - } -}; - -template -__global__ void WhereCUDAKernel(const int N, const bool* cond, const T* x, - const T* y, T* out) { - int idx = blockDim.x * blockIdx.x + threadIdx.x; - for (; idx < N; idx += blockDim.x * gridDim.x) { - out[idx] = cond[idx] ? x[idx] : y[idx]; - } -} - -template -__global__ void WhereGradCUDAKernel(const int N, const T* dout, - const bool* cond, T* dx, T* dy) { - int idx = blockDim.x * blockIdx.x + threadIdx.x; - for (; idx < N; idx += blockDim.x * gridDim.x) { - if (dx != nullptr) { - dx[idx] = cond[idx] ? dout[idx] : 0.; - } - if (dy != nullptr) { - dy[idx] = cond[idx] ? 0. : dout[idx]; - } - } -} - -template -class WhereKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* condition = context.Input("Condition"); - auto* X = context.Input("X"); - auto* Y = context.Input("Y"); - auto* out = context.Output("Out"); - auto numel = condition->numel(); - - // TODO(GaaoWei8): Input of where can be broadcast - const bool* cond_data = condition->data(); - const T* x_data = X->data(); - const T* y_data = Y->data(); - T* out_data = out->mutable_data(context.GetPlace()); - - auto stream = context.cuda_device_context().stream(); - auto& dev_ctx = - context.template device_context(); - auto functor = CondFunctor(); - std::vector ins = {condition, X, Y}; - std::vector outs = {out}; - paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, - &outs, functor); - } -}; - -template -class WhereGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* condition = context.Input("Condition"); - const bool* cond_data = condition->data(); - auto numel = condition->numel(); - - auto* dout_t = - context.Input(framework::GradVarName("Out")); - auto* dx_t = context.Output(framework::GradVarName("X")); - auto* dy_t = context.Output(framework::GradVarName("Y")); - auto* dout = dout_t->data(); - T* dx = - (dx_t != nullptr) ? dx_t->mutable_data(context.GetPlace()) : nullptr; - T* dy = - (dy_t != nullptr) ? 
dy_t->mutable_data(context.GetPlace()) : nullptr; - - auto stream = context.cuda_device_context().stream(); - auto& dev_ctx = - context.template device_context(); - auto config = GetGpuLaunchConfig1D(dev_ctx, condition->numel()); - WhereGradCUDAKernel< - T><<>>( - numel, dout, cond_data, dx, dy); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OP_CUDA_KERNEL( - where, paddle::operators::WhereKernel, - paddle::operators::WhereKernel, - paddle::operators::WhereKernel, - paddle::operators::WhereKernel); -REGISTER_OP_CUDA_KERNEL( - where_grad, - paddle::operators::WhereGradKernel, - paddle::operators::WhereGradKernel, - paddle::operators::WhereGradKernel, - paddle::operators::WhereGradKernel); diff --git a/paddle/fluid/operators/where_op.h b/paddle/fluid/operators/where_op.h deleted file mode 100644 index 5398ee024a2890e38e88fc981721872e1ba34d60..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/where_op.h +++ /dev/null @@ -1,73 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -class WhereKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* condition = context.Input("Condition"); - auto* X = context.Input("X"); - auto* Y = context.Input("Y"); - auto* out = context.Output("Out"); - - const bool* cond_data = condition->data(); - const T* x_data = X->data(); - const T* y_data = Y->data(); - T* out_data = out->mutable_data(context.GetPlace()); - - auto x_numel = X->numel(); - for (int i = 0; i < x_numel; i++) { - out_data[i] = cond_data[i] ? x_data[i] : y_data[i]; - } - } -}; - -template -class WhereGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* condition = context.Input("Condition"); - const auto* cond_data = condition->data(); - auto numel = condition->numel(); - - auto* dout_t = - context.Input(framework::GradVarName("Out")); - auto* dx_t = context.Output(framework::GradVarName("X")); - auto* dy_t = context.Output(framework::GradVarName("Y")); - - auto* dout = dout_t->data(); - if (dx_t != nullptr) { - auto* dx = dx_t->mutable_data(context.GetPlace()); - for (int i = 0; i < numel; i++) { - dx[i] = dout[i] * (cond_data[i] ? 1. : 0.); - } - } - if (dy_t != nullptr) { - auto* dy = dy_t->mutable_data(context.GetPlace()); - for (int i = 0; i < numel; i++) { - dy[i] = dout[i] * (cond_data[i] ? 0. 
: 1.); - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/where_op_npu.cc b/paddle/fluid/operators/where_op_npu.cc index d4294393daa34612aae815b0ebfab7d55f0b9f46..35508950941783753734a916aa7c2dcff7731181 100755 --- a/paddle/fluid/operators/where_op_npu.cc +++ b/paddle/fluid/operators/where_op_npu.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/where_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/where_op_xpu.cc b/paddle/fluid/operators/where_op_xpu.cc index 3a4875c07005119e90f5d5cb448a63bcf62a09a4..41232c8b5e8d88564e59e0343a26a4ae98d5ed90 100644 --- a/paddle/fluid/operators/where_op_xpu.cc +++ b/paddle/fluid/operators/where_op_xpu.cc @@ -14,7 +14,7 @@ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/where_op.h" +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/platform/device/gpu/gpu_primitives.h b/paddle/fluid/platform/device/gpu/gpu_primitives.h index 8aec8e840f33273a3130355c751e635e4a3f6736..803674779e756f000005d106f950659ea765c5ce 100644 --- a/paddle/fluid/platform/device/gpu/gpu_primitives.h +++ b/paddle/fluid/platform/device/gpu/gpu_primitives.h @@ -20,6 +20,7 @@ limitations under the License. */ #include #endif #include +#include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/float16.h" @@ -244,6 +245,72 @@ __device__ __forceinline__ void VectorizedAtomicAddPerBlock( #endif #endif +// NOTE(zhangbo): cuda do not have atomicCAS for __nv_bfloat16. +inline static __device__ uint32_t bf16_add_to_low_half(uint32_t val, float x) { + bfloat16 low_half; + // the bfloat16 in lower 16bits + low_half.x = static_cast(val & 0xFFFFu); + low_half = static_cast(static_cast(low_half) + x); + return (val & 0xFFFF0000u) | low_half.x; +} + +inline static __device__ uint32_t bf16_add_to_high_half(uint32_t val, float x) { + bfloat16 high_half; + // the bfloat16 in higher 16bits + high_half.x = static_cast(val >> 16); + high_half = static_cast(static_cast(high_half) + x); + return (val & 0xFFFFu) | (static_cast(high_half.x) << 16); +} + +#if CUDA_VERSION >= 11000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 +static __device__ __forceinline__ bfloat16 CUDABF16ToPDBF16(__nv_bfloat16 x) { + return *reinterpret_cast(&x); +} + +static __device__ __forceinline__ __nv_bfloat16 PDBF16ToCUDABF16(bfloat16 x) { + return *reinterpret_cast<__nv_bfloat16 *>(&x); +} + +CUDA_ATOMIC_WRAPPER(Add, bfloat16) { + return CUDABF16ToPDBF16(atomicAdd(reinterpret_cast<__nv_bfloat16 *>(address), + PDBF16ToCUDABF16(val))); +} +#else +CUDA_ATOMIC_WRAPPER(Add, bfloat16) { + // concrete packed bfloat16 value may exsits in lower or higher 16bits + // of the 32bits address. + uint32_t *address_as_ui = reinterpret_cast( + reinterpret_cast(address) - + (reinterpret_cast(address) & 0x02)); + float val_f = static_cast(val); + uint32_t old = *address_as_ui; + uint32_t sum; + uint32_t newval; + uint32_t assumed; + if (((uintptr_t)address & 0x02) == 0) { + // the bfloat16 value stay at lower 16 bits of the address. 
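    // The do/while blocks below are the standard atomicCAS emulation for
    // sub-word atomics: read the aligned 32-bit word that contains the
    // bfloat16, splice the incremented half back in via
    // bf16_add_to_low_half / bf16_add_to_high_half, and retry until no other
    // thread modified the word between the read and the compare-and-swap.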
+ do { + assumed = old; + old = atomicCAS(address_as_ui, assumed, + bf16_add_to_low_half(assumed, val_f)); + } while (old != assumed); + bfloat16 ret; + ret.x = old & 0xFFFFu; + return ret; + } else { + // the bfloat16 value stay at higher 16 bits of the address. + do { + assumed = old; + old = atomicCAS(address_as_ui, assumed, + bf16_add_to_high_half(assumed, val_f)); + } while (old != assumed); + bfloat16 ret; + ret.x = old >> 16; + return ret; + } +} +#endif + CUDA_ATOMIC_WRAPPER(Add, complex) { float *real = reinterpret_cast(address); float *imag = real + 1; diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 1f06eda8a2ee5dc8322b5e16e1f7eb2e0703f9a8..c61e8212b0257cc5ccffaa27971b959472a71f06 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -81,7 +81,7 @@ set(PYBIND_SRCS cuda_streams_py.cc) if(NOT ON_INFER) - set (PYBIND_DEPS ${PYBIND_DEPS} processgroup) + set (PYBIND_DEPS ${PYBIND_DEPS} processgroup eager_reducer) if (WITH_NCCL) set (PYBIND_DEPS ${PYBIND_DEPS} processgroup_nccl) endif() diff --git a/paddle/fluid/pybind/distributed_py.cc b/paddle/fluid/pybind/distributed_py.cc index e057fb53ccecc7193fd52b8beda2c4f2880560e8..7b59188a9f3cdae2d0e9df329b969395b50177b0 100644 --- a/paddle/fluid/pybind/distributed_py.cc +++ b/paddle/fluid/pybind/distributed_py.cc @@ -23,6 +23,7 @@ limitations under the License. */ #include "paddle/fluid/distributed/collective/ProcessGroup.h" #include "paddle/fluid/distributed/collective/Types.h" +#include "paddle/fluid/distributed/collective/reducer.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/imperative/layer.h" @@ -143,6 +144,19 @@ void BindDistributed(py::module *m) { [](distributed::ProcessGroupStrategy &self, int nrings) { self.nrings_ = nrings; }); + + m->def("eager_assign_group_by_size", + [](py::handle py_tensors, std::vector is_sparse_gradient, + std::vector group_size_limits, + std::vector tensor_indices) { + auto tensors = CastPyArg2VectorOfTensor(py_tensors.ptr(), 0); + return distributed::Eager_AssignGroupBySize( + tensors, is_sparse_gradient, group_size_limits, tensor_indices); + }, + py::arg("tensors"), py::arg("is_sparse_gradient"), + py::arg("group_size_limits") = std::vector{25 * 1024 * 1024}, + py::arg("tensor_indices") = std::vector{}, + py::call_guard()); } } // end namespace pybind diff --git a/paddle/fluid/pybind/op_function_generator.h b/paddle/fluid/pybind/op_function_generator.h index 2b07a439d33b4a96a10a893a95e0dd26f83dd8c7..d23b3dd64ab05cf10d8096a84e317645972211d1 100644 --- a/paddle/fluid/pybind/op_function_generator.h +++ b/paddle/fluid/pybind/op_function_generator.h @@ -60,7 +60,8 @@ std::map> op_ins_map = { {"momentum", {"Param", "Grad", "Velocity", "LearningRate", "MasterParam"}}, {"merged_momentum", {"Param", "Grad", "Velocity", "LearningRate", "MasterParam"}}, - {"sparse_momentum", {"Param", "Grad", "Velocity", "Index", "LearningRate"}}, + {"sparse_momentum", + {"Param", "Grad", "Velocity", "Index", "LearningRate", "MasterParam"}}, {"rnn", {"Input", "PreState", "WeightList", "SequenceLength"}}, {"run_program", {"X", "Params"}}, {"fused_feedforward", @@ -124,7 +125,7 @@ std::map> op_outs_map = { {"generate_proposals_v2", {"RpnRois", "RpnRoiProbs", "RpnRoisNum"}}, {"momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}}, {"merged_momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}}, - {"sparse_momentum", {"ParamOut", "VelocityOut"}}, + {"sparse_momentum", {"ParamOut", 
"VelocityOut", "MasterParamOut"}}, {"rnn", {"DropoutState", "Reserve", "Out", "State"}}, {"run_program", {"DOut"}}, {"adam", @@ -181,7 +182,7 @@ std::map> op_passing_outs_map = { "out_old_num_accumulates", "out_num_updates"}}, {"momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}}, {"merged_momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}}, - {"sparse_momentum", {"ParamOut", "VelocityOut"}}, + {"sparse_momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}}, {"batch_norm", {"MeanOut", "VarianceOut"}}, {"sync_batch_norm", {"MeanOut", "VarianceOut"}}, {"accuracy", {"Correct", "Total"}}, diff --git a/paddle/infrt/dialect/infrt/infrt_ops_base.td b/paddle/infrt/dialect/infrt/infrt_ops_base.td index 81d3d028a66bea29dd9a373e1905ac02468251fd..978b126d754169e4f57fdd3b79fe49855c5d3359 100644 --- a/paddle/infrt/dialect/infrt/infrt_ops_base.td +++ b/paddle/infrt/dialect/infrt/infrt_ops_base.td @@ -40,6 +40,12 @@ def DenseTensor : Infrt_Type<"DenseTensor"> { ); } +// Type Constrait for concrete DenseTensor type. +class DenseTensor : + Type, + "!infrt.DenseTensor<"#target#","#precision#","#layout#">", + "::infrt::DenseTensorType">; + // Base class for infrt dialect attributes. class Infrt_Attr traits = [], string baseCppClass = "::mlir::Attribute"> diff --git a/paddle/infrt/dialect/init_infrt_dialects.cc b/paddle/infrt/dialect/init_infrt_dialects.cc index b5b8de7a20d0866802b8ce72e12dd7ed35dccbd1..c5c81b4b0f22dd369d7b63d34f45c41897052185 100644 --- a/paddle/infrt/dialect/init_infrt_dialects.cc +++ b/paddle/infrt/dialect/init_infrt_dialects.cc @@ -21,8 +21,8 @@ #include "paddle/infrt/dialect/infrt/infrt_dialect.h" #include "paddle/infrt/dialect/infrt_base.h" #include "paddle/infrt/dialect/pd_ops.h" -#include "paddle/infrt/dialect/phi/infrt_phi_tensor.h" -#include "paddle/infrt/dialect/phi/phi_base.h" +#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h" +#include "paddle/infrt/dialect/phi/ir/phi_base.h" #include "paddle/infrt/dialect/tensor_shape.h" namespace infrt { diff --git a/paddle/infrt/dialect/phi/CMakeLists.txt b/paddle/infrt/dialect/phi/CMakeLists.txt index 626b02c1f790d0a7f38887be33dace1c773a2cb1..d477b6b9bdc278b2408794fa4235d9c8bca5850a 100644 --- a/paddle/infrt/dialect/phi/CMakeLists.txt +++ b/paddle/infrt/dialect/phi/CMakeLists.txt @@ -2,16 +2,7 @@ if (NOT INFRT_WITH_PHI) return() endif() -#mlir_tablegen_on(infrt_phi_base DIALECT phi) -add_mlir_dialect(infrt_phi_base phi) -add_mlir_dialect(infrt_phi_tensor phi_dt) -add_mlir_dialect(infrt_phi_kernel phi_kernel) -#mlir_tablegen_on(infrt_phi_tensor) - -gather_srcs(infrt_src SRCS - phi_base.cc infrt_phi_tensor.cc - infrt_phi_tensor.cc) - +add_subdirectory(ir) add_subdirectory(pass) add_executable(phi-exec phi_exec.cc) diff --git a/paddle/infrt/dialect/phi/ir/CMakeLists.txt b/paddle/infrt/dialect/phi/ir/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..8c1d75629d09c210f813cd994199da77ca48a3b8 --- /dev/null +++ b/paddle/infrt/dialect/phi/ir/CMakeLists.txt @@ -0,0 +1,9 @@ +#mlir_tablegen_on(infrt_phi_base DIALECT phi) +add_mlir_dialect(infrt_phi_base phi) +add_mlir_dialect(infrt_phi_tensor phi_dt) +add_mlir_dialect(infrt_phi_kernel phi_kernel) +#mlir_tablegen_on(infrt_phi_tensor) + +gather_srcs(infrt_src SRCS + phi_base.cc + infrt_phi_tensor.cc) diff --git a/paddle/infrt/dialect/phi/infrt_phi_base.td b/paddle/infrt/dialect/phi/ir/infrt_phi_base.td similarity index 100% rename from paddle/infrt/dialect/phi/infrt_phi_base.td rename to paddle/infrt/dialect/phi/ir/infrt_phi_base.td diff --git 
a/paddle/infrt/dialect/phi/infrt_phi_kernel.td b/paddle/infrt/dialect/phi/ir/infrt_phi_kernel.td similarity index 92% rename from paddle/infrt/dialect/phi/infrt_phi_kernel.td rename to paddle/infrt/dialect/phi/ir/infrt_phi_kernel.td index 879994907cc0d951bde838b23fd129e865a360f2..37bf0b5ef213d76613162aa9bb3d2f9b6324340e 100644 --- a/paddle/infrt/dialect/phi/infrt_phi_kernel.td +++ b/paddle/infrt/dialect/phi/ir/infrt_phi_kernel.td @@ -4,7 +4,7 @@ include "mlir/Interfaces/SideEffectInterfaces.td" include "mlir/IR/OpBase.td" include "paddle/infrt/dialect/infrt_base.td" -include "paddle/infrt/dialect/phi/infrt_phi_base.td" +include "paddle/infrt/dialect/phi/ir/infrt_phi_base.td" def PHI_KernelDialect : Dialect { let name = "phi_kernel"; diff --git a/paddle/infrt/dialect/phi/infrt_phi_tensor.cc b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.cc similarity index 71% rename from paddle/infrt/dialect/phi/infrt_phi_tensor.cc rename to paddle/infrt/dialect/phi/ir/infrt_phi_tensor.cc index 9df1a47031b1f726578291f628cda7d12900bcb7..64780294be92b86bcf29d3cb2045434cc6479517 100644 --- a/paddle/infrt/dialect/phi/infrt_phi_tensor.cc +++ b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.cc @@ -12,12 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/infrt/dialect/phi/infrt_phi_tensor.h" +#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h" #include -#include "paddle/infrt/dialect/phi/infrt_phi_tensorDialect.cpp.inc" -#include "paddle/infrt/dialect/phi/infrt_phi_tensorTypes.cpp.inc" +#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensorDialect.cpp.inc" +#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensorTypes.cpp.inc" namespace infrt { namespace phi { @@ -25,7 +25,7 @@ namespace phi { void PHIDenseTensorDialect::initialize() { #define GET_OP_LIST addOperations< -#include "paddle/infrt/dialect/phi/infrt_phi_tensor.cpp.inc" +#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.cpp.inc" >(); } @@ -33,4 +33,4 @@ void PHIDenseTensorDialect::initialize() { } // namespace infrt #define GET_OP_CLASSES -#include "paddle/infrt/dialect/phi/infrt_phi_tensor.cpp.inc" // NOLINT +#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.cpp.inc" // NOLINT diff --git a/paddle/infrt/dialect/phi/infrt_phi_tensor.h b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h similarity index 83% rename from paddle/infrt/dialect/phi/infrt_phi_tensor.h rename to paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h index 2780f9759185ef45bc19f43fc621f46eabbe7a66..9a92558daab0376d430fe04b853a810cf42b6e85 100644 --- a/paddle/infrt/dialect/phi/infrt_phi_tensor.h +++ b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h @@ -29,11 +29,11 @@ #include #include -#include "paddle/infrt/dialect/phi/infrt_phi_tensorDialect.h.inc" -#include "paddle/infrt/dialect/phi/infrt_phi_tensorTypes.h.inc" +#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensorDialect.h.inc" +#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensorTypes.h.inc" #include "paddle/infrt/dialect/dense_tensor.h" -#include "paddle/infrt/dialect/phi/phi_base.h" +#include "paddle/infrt/dialect/phi/ir/phi_base.h" // NOLINT #define GET_OP_CLASSES -#include "paddle/infrt/dialect/phi/infrt_phi_tensor.h.inc" +#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h.inc" diff --git a/paddle/infrt/dialect/phi/infrt_phi_tensor.td b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td similarity index 97% rename from paddle/infrt/dialect/phi/infrt_phi_tensor.td rename to paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td index 
b7b3b061fdbe42909ac503d9d387cb8aed6bdc1a..dc3a4b340d767a371bc411c0a58d1fc7c72ca83e 100644 --- a/paddle/infrt/dialect/phi/infrt_phi_tensor.td +++ b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td @@ -2,7 +2,7 @@ #else #define PHI_TENSOR -include "paddle/infrt/dialect/phi/infrt_phi_base.td" +include "paddle/infrt/dialect/phi/ir/infrt_phi_base.td" include "mlir/Interfaces/SideEffectInterfaces.td" include "mlir/IR/OpBase.td" include "paddle/infrt/dialect/infrt_base.td" diff --git a/paddle/infrt/dialect/phi/phi_base.cc b/paddle/infrt/dialect/phi/ir/phi_base.cc similarity index 84% rename from paddle/infrt/dialect/phi/phi_base.cc rename to paddle/infrt/dialect/phi/ir/phi_base.cc index a1caa40f6383b5016a9e237733a0b3ef016cbc97..7a6b3f3f0a404043f49a6df3e5bdcb873dd442c9 100644 --- a/paddle/infrt/dialect/phi/phi_base.cc +++ b/paddle/infrt/dialect/phi/ir/phi_base.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/infrt/dialect/phi/phi_base.h" +#include "paddle/infrt/dialect/phi/ir/phi_base.h" #include #include @@ -21,8 +21,8 @@ #include #include #include "paddle/infrt/common/global.h" -#include "paddle/infrt/dialect/phi/infrt_phi_base.cpp.inc" -#include "paddle/infrt/dialect/phi/infrt_phi_baseDialect.cpp.inc" +#include "paddle/infrt/dialect/phi/ir/infrt_phi_base.cpp.inc" +#include "paddle/infrt/dialect/phi/ir/infrt_phi_baseDialect.cpp.inc" namespace infrt { namespace phi { @@ -51,11 +51,11 @@ void PHIDialect::printType(::mlir::Type type, void PHIDialect::initialize() { addOperations< #define GET_OP_LIST -#include "paddle/infrt/dialect/phi/infrt_phi_base.cpp.inc" // NOLINT +#include "paddle/infrt/dialect/phi/ir/infrt_phi_base.cpp.inc" // NOLINT >(); addTypes< #define GET_TYPEDEF_LIST -#include "paddle/infrt/dialect/phi/infrt_phi_baseTypes.cpp.inc" // NOLINT +#include "paddle/infrt/dialect/phi/ir/infrt_phi_baseTypes.cpp.inc" // NOLINT >(); } @@ -81,4 +81,4 @@ mlir::Type PHIDialect::parseType(mlir::DialectAsmParser& parser) const { } // namespace infrt #define GET_TYPEDEF_CLASSES -#include "paddle/infrt/dialect/phi/infrt_phi_baseTypes.cpp.inc" // NOLINT +#include "paddle/infrt/dialect/phi/ir/infrt_phi_baseTypes.cpp.inc" // NOLINT diff --git a/paddle/infrt/dialect/phi/phi_base.h b/paddle/infrt/dialect/phi/ir/phi_base.h similarity index 84% rename from paddle/infrt/dialect/phi/phi_base.h rename to paddle/infrt/dialect/phi/ir/phi_base.h index 11174290f92bd18fdc91588d7eba89f61bb05413..a08d8229fccf53225311b451e941f99e8a3d0e8a 100644 --- a/paddle/infrt/dialect/phi/phi_base.h +++ b/paddle/infrt/dialect/phi/ir/phi_base.h @@ -19,11 +19,13 @@ #include -#include "paddle/infrt/dialect/phi/infrt_phi_base.h.inc" -#include "paddle/infrt/dialect/phi/infrt_phi_baseDialect.h.inc" +#include "paddle/infrt/dialect/phi/ir/infrt_phi_baseDialect.h.inc" #define GET_TYPEDEF_CLASSES -#include "paddle/infrt/dialect/phi/infrt_phi_baseTypes.h.inc" +#include "paddle/infrt/dialect/phi/ir/infrt_phi_baseTypes.h.inc" + +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/phi/ir/infrt_phi_base.h.inc" namespace mlir { namespace OpTrait { diff --git a/paddle/infrt/host_context/value.h b/paddle/infrt/host_context/value.h index eb9a2092657aa079ee6a4007d7ded9f8896e93aa..7e7d77d3af741443d490dcfdd5b9ee6677b557ef 100644 --- a/paddle/infrt/host_context/value.h +++ b/paddle/infrt/host_context/value.h @@ -73,7 +73,7 @@ using ValueVariantType = std::vector, paddle::experimental::ScalarBase, paddle::experimental::ScalarArrayBase, - std::vector, + 
std::vector, phi::MetaConfig, paddle::experimental::Backend, paddle::experimental::DataLayout, diff --git a/paddle/phi/api/lib/api_custom_impl.cc b/paddle/phi/api/lib/api_custom_impl.cc index c7400b93fcdc18314318fae9482e1e5e5bfb8aef..19b113838eab5403aca00d9d97b278646228c512 100644 --- a/paddle/phi/api/lib/api_custom_impl.cc +++ b/paddle/phi/api/lib/api_custom_impl.cc @@ -94,12 +94,16 @@ std::vector split_impl(const Tensor& x, std::vector out; auto dense_outs = SetKernelOutput(out_number, kernel_backend, &out); std::vector meta_outs; + meta_outs.reserve(out_number); + std::vector meta_out_ptrs; + meta_out_ptrs.reserve(out_number); for (size_t i = 0; i < out_number; ++i) { meta_outs.push_back(dense_outs[i]); + meta_out_ptrs.push_back(&meta_outs.back()); } phi::SplitInferMeta( - MakeMetaTensor(*dense_x), num_or_sections, axis, &meta_outs); + MakeMetaTensor(*dense_x), num_or_sections, axis, meta_out_ptrs); using kernel_signature = void (*)(const platform::DeviceContext&, const phi::DenseTensor&, diff --git a/paddle/phi/api/lib/utils/tensor_utils.cc b/paddle/phi/api/lib/utils/tensor_utils.cc index 31325e22afae31e55a3a2d939739d6745ccd3d36..1c9f7c3a8683daaf26cb87b23e50284d0329c4a8 100644 --- a/paddle/phi/api/lib/utils/tensor_utils.cc +++ b/paddle/phi/api/lib/utils/tensor_utils.cc @@ -136,26 +136,5 @@ phi::ScalarArray MakePhiScalarArrayFromVarList( return result; } -void ResetTensorDtypeAndLayoutByArgDef(phi::TensorBase* dst, - const phi::TensorArgDef& arg_def) { - VLOG(5) << "ResetTensor by TensorArgDef."; - if (phi::DenseTensor::classof(dst)) { - auto* dense_t = static_cast(dst); - auto* meta = phi::DenseTensorUtils::GetMutableMeta(dense_t); - meta->dtype = arg_def.dtype; - meta->layout = arg_def.layout; - } else if (phi::SelectedRows::classof(dst)) { - auto* selected_rows = static_cast(dst); - auto* meta = - phi::DenseTensorUtils::GetMutableMeta(selected_rows->mutable_value()); - meta->dtype = arg_def.dtype; - meta->layout = arg_def.layout; - } else { - PADDLE_THROW(phi::errors::Unimplemented( - "Unsupported tensor type is received when reseting tensor dtype and " - "layout by argument definition.")); - } -} - } // namespace experimental } // namespace paddle diff --git a/paddle/phi/api/lib/utils/tensor_utils.h b/paddle/phi/api/lib/utils/tensor_utils.h index 8b30d5421ab943d568a046ca0fe4698849780ffd..64df59c1a2a2de3f72ce46874fe07df70d33599e 100644 --- a/paddle/phi/api/lib/utils/tensor_utils.h +++ b/paddle/phi/api/lib/utils/tensor_utils.h @@ -42,8 +42,5 @@ phi::ScalarArray MakePhiScalarArrayFromVar(const framework::Variable& variable); phi::ScalarArray MakePhiScalarArrayFromVarList( const std::vector& variable_list); -void ResetTensorDtypeAndLayoutByArgDef(phi::TensorBase* dst, - const phi::TensorArgDef& arg_def); - } // namespace experimental } // namespace paddle diff --git a/paddle/phi/backends/gpu/gpu_context.h b/paddle/phi/backends/gpu/gpu_context.h index 603ce0817c4ebdcb17bb97b14dd0700badcf2385..b9d843982dc5ebb8312a4912ebfa96c73e22b6c5 100644 --- a/paddle/phi/backends/gpu/gpu_context.h +++ b/paddle/phi/backends/gpu/gpu_context.h @@ -227,4 +227,12 @@ class GPUContext : public DeviceContext { // must use different function name for cudnn kernel using GPUDNNContext = GPUContext; +// KPS (Kernel PrimitiveS API) needs to exist as a kind of backend, +// because we want to implement a KPS-based kernel and make it run +// on GPU and XPU at the same time, so we need KPSContext when registering +// KPS Kernel. Note: XPU and GPU cannot be compiled at the same time! 
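// Illustrative sketch of the KPSContext aliasing described in the note above
// (and defined just below here and in xpu_context.h): a KPS-based kernel is
// written once against the alias, and each build resolves it to exactly one
// device context. Everything below is a hypothetical stand-in (the
// Example* types and the SKETCH_WITH_XPU_KP flag are not Paddle names); the
// real code aliases phi::GPUContext or phi::XPUContext to phi::KPSContext.
#include <iostream>

struct ExampleGPUContext { const char* Name() const { return "GPU"; } };
struct ExampleXPUContext { const char* Name() const { return "XPU"; } };

#if defined(SKETCH_WITH_XPU_KP)
using ExampleKPSContext = ExampleXPUContext;  // XPU2 KP build
#else
using ExampleKPSContext = ExampleGPUContext;  // CUDA / HIP build
#endif

// A KPS-style kernel body is written once against the alias...
template <typename Context>
void RunKpsKernel(const Context& ctx) {
  std::cout << "KPS kernel dispatched to the " << ctx.Name() << " backend\n";
}

int main() {
  // ...and each build instantiates it for exactly one device context, which
  // is why GPU and XPU cannot be enabled in the same build.
  ExampleKPSContext ctx;
  RunKpsKernel(ctx);
  return 0;
}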
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +using KPSContext = GPUContext; +#endif + } // namespace phi diff --git a/paddle/phi/backends/xpu/xpu_context.h b/paddle/phi/backends/xpu/xpu_context.h index 3005d1707e638a346c0d20e83a808c5c0da334e1..b87489c567cabea137850163879ed00d151f60cb 100644 --- a/paddle/phi/backends/xpu/xpu_context.h +++ b/paddle/phi/backends/xpu/xpu_context.h @@ -66,4 +66,12 @@ class XPUContext : public DeviceContext { std::unique_ptr impl_; }; +// KPS (Kernel PrimitiveS API) needs to exist as a kind of backend, +// because we want to implement a KPS-based kernel and make it run +// on GPU and XPU at the same time, so we need KPSContext when registering +// KPS Kernel. Note: XPU and GPU cannot be compiled at the same time! +#if PADDLE_WITH_XPU_KP +using KPSContext = XPUContext; +#endif + } // namespace phi diff --git a/paddle/phi/common/backend.h b/paddle/phi/common/backend.h index 4b7bf65be39cbc83688e7dab3fdd745c2be82b22..a9e12f5d81ed08328afad9e7da6d1e1999d47be1 100644 --- a/paddle/phi/common/backend.h +++ b/paddle/phi/common/backend.h @@ -52,6 +52,9 @@ enum class Backend : uint8_t { MKLDNN, GPUDNN, // cuDNN and hipDNN + // paddle kernel primitives backend + KPS, + // end of backend types NUM_BACKENDS, @@ -115,6 +118,9 @@ inline std::ostream& operator<<(std::ostream& os, Backend backend) { case Backend::GPUDNN: os << "GPUDNN"; break; + case Backend::KPS: + os << "KPS"; + break; default: { size_t device_type_id_ = static_cast(backend) - static_cast(Backend::NUM_BACKENDS); @@ -147,6 +153,8 @@ inline Backend StringToBackend(const char* backend_cstr) { return Backend::MKLDNN; } else if (s == std::string("GPUDNN")) { return Backend::GPUDNN; + } else if (s == std::string("KPS")) { + return Backend::KPS; } else { return static_cast(static_cast(Backend::NUM_BACKENDS) + phi::GetOrRegisterGlobalDeviceTypeId(s)); diff --git a/paddle/phi/core/CMakeLists.txt b/paddle/phi/core/CMakeLists.txt index f4f57a0acbbb386a3642a05e0d0dc70cd082a4d8..8ffacbb39bb249c57fb5c9ef1462d03747356f96 100644 --- a/paddle/phi/core/CMakeLists.txt +++ b/paddle/phi/core/CMakeLists.txt @@ -22,8 +22,8 @@ cc_library(sparse_csr_tensor SRCS sparse_csr_tensor.cc DEPS dense_tensor tensor_ cc_library(meta_tensor SRCS meta_tensor.cc DEPS tensor_base tensor_meta dense_tensor) cc_library(infermeta_utils SRCS infermeta_utils.cc DEPS meta_tensor) -cc_library(phi_device_context SRCS device_context.cc DEPS dense_tensor selected_rows) cc_library(selected_rows SRCS selected_rows_impl.cc DEPS dense_tensor phi_enforce ddim memcpy) +cc_library(phi_device_context SRCS device_context.cc DEPS dense_tensor selected_rows) cc_library(phi_custom_kernel SRCS custom_kernel.cc DEPS kernel_factory convert_utils) diff --git a/paddle/phi/core/compat/convert_utils.cc b/paddle/phi/core/compat/convert_utils.cc index 3b7a733ede90464328600ebd3c7d371314b99cc3..b85db07bd9dfa0d798304aac6bd86089a9f0b4c0 100644 --- a/paddle/phi/core/compat/convert_utils.cc +++ b/paddle/phi/core/compat/convert_utils.cc @@ -66,6 +66,14 @@ phi::Place TransToPhiPlace(const Backend& backend, bool set_device_id) { case phi::Backend::XPU: return phi::XPUPlace( set_device_id ? phi::backends::xpu::GetXPUCurrentDeviceId() : 0); +#endif + case phi::Backend::KPS: +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + return phi::GPUPlace( + set_device_id ? phi::backends::gpu::GetCurrentDeviceId() : 0); +#elif defined(PADDLE_WITH_XPU_KP) + return phi::XPUPlace( + set_device_id ? 
phi::backends::xpu::GetXPUCurrentDeviceId() : 0); #endif default: { #ifdef PADDLE_WITH_CUSTOM_DEVICE diff --git a/paddle/phi/core/custom_kernel.cc b/paddle/phi/core/custom_kernel.cc index f84a2bd8d9c5d0634f29485fc07f649ea9fb1b9e..58f9e1c623e81b4f2877099d1cdc2a8fe2e18b9e 100644 --- a/paddle/phi/core/custom_kernel.cc +++ b/paddle/phi/core/custom_kernel.cc @@ -20,16 +20,16 @@ void RegisterCustomKernels(const CustomKernelMap& custom_kernel_map) { auto& kernel_info_map = custom_kernel_map.GetMap(); VLOG(3) << "Size of custom_kernel_map: " << kernel_info_map.size(); + auto& kernels = KernelFactory::Instance().kernels(); for (auto& pair : kernel_info_map) { - PADDLE_ENFORCE_EQ( - KernelFactory::Instance().HasCompatiblePhiKernel(pair.first), - true, + PADDLE_ENFORCE_NE( + kernels.find(pair.first), + kernels.end(), phi::errors::InvalidArgument( "The kernel %s is not ready for custom kernel registering.", pair.first)); for (auto& info_pair : pair.second) { - auto& kernels = KernelFactory::Instance().kernels(); PADDLE_ENFORCE_EQ( kernels[pair.first].find(info_pair.first), kernels[pair.first].end(), diff --git a/paddle/phi/core/dense_tensor_impl.cc b/paddle/phi/core/dense_tensor_impl.cc index 29e7dc01f32db20e3756677fe8a48fcb138b3883..5ee83089589e89b3cb29f095bd88fb16ff39d296 100644 --- a/paddle/phi/core/dense_tensor_impl.cc +++ b/paddle/phi/core/dense_tensor_impl.cc @@ -73,11 +73,6 @@ void DenseTensor::set_layout(const paddle::framework::DataLayout layout) { // Note: When you reset holder, you need to ensure the offset is correct void DenseTensor::ResetHolder(const std::shared_ptr& holder) { if (holder_) { - // TODO(zyfncg): The change of static_cast<> in check will recover back - // when SetAllocationForOutputTenosr is deleted. - // Now the numel() may return -1, and will cast to a very large number when - // compare with a data with unsigned long type, this will make checking - // failed, so it's a temporary solution to deal with this problem. 
PADDLE_ENFORCE_LE( numel() * static_cast(SizeOf(dtype())) + static_cast(meta_.offset), diff --git a/paddle/phi/core/infermeta_utils.cc b/paddle/phi/core/infermeta_utils.cc index f3dd056911ecf81d5ca0954114acbd1a3ac19ad9..671ba2ec7dc258865c01fff99ce97aacaeddd3cc 100644 --- a/paddle/phi/core/infermeta_utils.cc +++ b/paddle/phi/core/infermeta_utils.cc @@ -75,13 +75,13 @@ paddle::optional InferMetaContext::OptionalInputAt( : paddle::optional{paddle::none}; } -std::vector InferMetaContext::InputsBetween(size_t start, - size_t end) const { - std::vector result; +std::vector InferMetaContext::InputsBetween(size_t start, + size_t end) const { + std::vector result; result.reserve(end - start); for (size_t i = start; i < end; ++i) { - result.emplace_back(*inputs_.at(i)); + result.push_back(inputs_.at(i).get()); } return result; @@ -91,12 +91,12 @@ MetaTensor* InferMetaContext::MutableOutputAt(size_t idx) { return outputs_.at(idx).get(); } -std::vector InferMetaContext::MutableOutputBetween(size_t start, - size_t end) { - std::vector result; +std::vector InferMetaContext::MutableOutputBetween(size_t start, + size_t end) { + std::vector result; result.reserve(end - start); for (size_t i = start; i < end; ++i) { - result.emplace_back(*outputs_.at(i)); + result.emplace_back(outputs_.at(i).get()); } return result; } diff --git a/paddle/phi/core/infermeta_utils.h b/paddle/phi/core/infermeta_utils.h index 203dbb269841ec8616b94c89603af3904eb572c3..a5775db74382c1aeda95a4351842444b5ad1e47e 100644 --- a/paddle/phi/core/infermeta_utils.h +++ b/paddle/phi/core/infermeta_utils.h @@ -50,13 +50,13 @@ class InferMetaContext { const std::pair& OutputRangeAt(size_t idx) const; const MetaConfig& GetMetaConfig() const; - const MetaTensor& InputAt(size_t idx) const; + const MetaTensor& InputAt(size_t idx) const; paddle::optional OptionalInputAt(size_t idx) const; + std::vector InputsBetween(size_t start, size_t end) const; - std::vector InputsBetween(size_t start, size_t end) const; MetaTensor* MutableOutputAt(size_t idx); - std::vector MutableOutputBetween(size_t start, size_t end); + std::vector MutableOutputBetween(size_t start, size_t end); template AttrType AttrAt(size_t idx) { @@ -157,7 +157,7 @@ struct InferMetaFnImpl { }; template - struct InferMetaFnCallHelper&, Tail...> { + struct InferMetaFnCallHelper&, Tail...> { template static void Call(InferMetaContext* ctx, PreviousArgs&... pargs) { static_assert(attr_idx == 0, @@ -165,7 +165,7 @@ struct InferMetaFnImpl { static_assert(out_idx == 0, "InferMeta's Input should appear before Outputs."); const std::pair range = ctx->InputRangeAt(in_idx); - std::vector arg = + std::vector arg = ctx->InputsBetween(range.first, range.second); InferMetaFnCallHelper< Tail...>::template Call(ctx, @@ -210,13 +210,12 @@ struct InferMetaFnImpl { }; template - struct InferMetaFnCallHelper*, Tail...> { + struct InferMetaFnCallHelper, Tail...> { template static void Call(InferMetaContext* ctx, PreviousArgs&... 
pargs) { const std::pair range = ctx->OutputRangeAt(out_idx); - std::vector tmp = + std::vector arg = ctx->MutableOutputBetween(range.first, range.second); - std::vector* arg = &tmp; InferMetaFnCallHelper< Tail...>::template Call(ctx, pargs..., diff --git a/paddle/phi/core/kernel_registry.h b/paddle/phi/core/kernel_registry.h index 6a1688947b986549e1feaf39cdf6c73749b0ff3a..7a05452cbebe08d16a4486a03923431a3e59cb81 100644 --- a/paddle/phi/core/kernel_registry.h +++ b/paddle/phi/core/kernel_registry.h @@ -87,13 +87,11 @@ struct KernelArgsParseFunctor { default_tensor_layout, default_key.dtype(), arg_type); -#ifndef PADDLE_WITH_CUSTOM_KERNEL } else if (arg_type == std::type_index(typeid(const SelectedRows&))) { args_def->AppendInput(default_key.backend(), default_tensor_layout, default_key.dtype(), arg_type); -#endif } else if (arg_type == std::type_index(typeid(DenseTensor*))) { args_def->AppendOutput(default_key.backend(), default_tensor_layout, @@ -105,13 +103,11 @@ struct KernelArgsParseFunctor { default_tensor_layout, default_key.dtype(), arg_type); -#ifndef PADDLE_WITH_CUSTOM_KERNEL } else if (arg_type == std::type_index(typeid(SelectedRows*))) { args_def->AppendOutput(default_key.backend(), default_tensor_layout, default_key.dtype(), arg_type); -#endif } else { // Attribute deal with // TODO(chenweihang): now here allow any types of attribute, maybe diff --git a/paddle/phi/core/kernel_utils.h b/paddle/phi/core/kernel_utils.h index 2ce1c829ce81a57cfad7343e2007ebf75b85ea80..b582375155a1878c52fd8fe9fb13f6e715df7067 100644 --- a/paddle/phi/core/kernel_utils.h +++ b/paddle/phi/core/kernel_utils.h @@ -23,9 +23,7 @@ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_context.h" -#ifndef PADDLE_WITH_CUSTOM_KERNEL #include "paddle/phi/core/selected_rows.h" -#endif #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/core/sparse_csr_tensor.h" #include "paddle/phi/core/type_defs.h" @@ -223,9 +221,7 @@ struct KernelImpl { PT_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(DenseTensor); PT_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(SelectedRows); PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(DenseTensor); -#ifndef PADDLE_WITH_CUSTOM_KERNEL PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(SelectedRows); -#endif PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(SparseCooTensor); PT_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(SparseCooTensor); @@ -260,9 +256,7 @@ struct KernelImpl { PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(DenseTensor); PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(DenseTensor); -#ifndef PADDLE_WITH_CUSTOM_KERNEL PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(SelectedRows); -#endif PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(SparseCooTensor); PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(SparseCooTensor); diff --git a/paddle/phi/core/tensor_meta.h b/paddle/phi/core/tensor_meta.h index 3d2da542c74176017492bdb9f567396f81308d6a..f4bd0be0b45b867b8ed98a5c50d2e3f58ea49780 100644 --- a/paddle/phi/core/tensor_meta.h +++ b/paddle/phi/core/tensor_meta.h @@ -23,13 +23,6 @@ limitations under the License. */ #include "paddle/utils/any.h" #include "paddle/utils/optional.h" -// Note: mixed_vector include many header now, LoD will be -// used on CUDA device? Can we use small_vector here? 
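// Illustrative sketch of the pointer-based multi-tensor InferMeta convention
// adopted by the InferMetaContext/InferMetaFnCallHelper changes above: outputs
// are handed over as a vector of MetaTensor pointers, so shapes are written
// straight into the caller's tensors and no temporary copy-back is needed.
// ExampleMetaTensor and ExampleSplitInferMeta are hypothetical stand-ins, not
// phi types.
#include <cstdint>
#include <iostream>
#include <vector>

struct ExampleMetaTensor {
  std::vector<int64_t> dims;
};

void ExampleSplitInferMeta(const ExampleMetaTensor& x,
                           int num,
                           std::vector<ExampleMetaTensor*> outs) {
  std::vector<int64_t> out_dims = x.dims;
  out_dims[0] /= num;                 // split evenly along the first axis
  for (ExampleMetaTensor* out : outs) {
    out->dims = out_dims;             // mutate the caller's tensors in place
  }
}

int main() {
  ExampleMetaTensor x{{8, 3}};
  ExampleMetaTensor o0, o1;
  ExampleSplitInferMeta(x, 2, {&o0, &o1});
  std::cout << o0.dims[0] << "x" << o0.dims[1] << "\n";  // prints 4x3
  return 0;
}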
-// @zhanlve: Rollback to original LoD for now -#ifndef PADDLE_WITH_CUSTOM_KERNEL -#include "paddle/fluid/framework/mixed_vector.h" -#endif - namespace phi { using DDim = phi::DDim; diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index dfaabf7cae21ec9b91624211ce9b852148dd7cc2..675e68af74339b508f589a55a9c3cf3aed37cecb 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -225,6 +225,41 @@ void HuberLossInferMeta(const MetaTensor& input, out->share_lod(input); } +void IndexSampleInferMeta(const MetaTensor& x, + const MetaTensor& y, + MetaTensor* out, + MetaConfig config) { + auto input_dims = x.dims(); + PADDLE_ENFORCE_EQ(input_dims.size(), + 2, + errors::InvalidArgument( + "Inputs(X) shape of IndexSample op should be 2-D, but " + "got X's shape = [%s], please check X shape.", + input_dims)); + + auto index_dims = y.dims(); + PADDLE_ENFORCE_EQ( + index_dims.size(), + 2, + errors::InvalidArgument( + "Inputs(Index) shape of IndexSample op should be 2-D, but " + "got Index's shape [%s] , please check index shape.", + input_dims)); + if (config.is_runtime) { + PADDLE_ENFORCE_EQ(input_dims[0], + index_dims[0], + errors::InvalidArgument( + "Inputs(X)'s value of dimension 0 must same with " + "Inputs(Index)'s value of dimension 0, but " + "got %d of Inputs(X), and got %d of Inputs(Index), " + "please check Inputs shape.", + input_dims[0], + index_dims[0])); + } + out->set_dtype(x.dtype()); + out->set_dims(index_dims); + out->share_lod(y); +} void CrossInferMeta(const MetaTensor& x, const MetaTensor& y, int axis, @@ -271,8 +306,7 @@ void CrossInferMeta(const MetaTensor& x, } void Atan2InferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) { - auto in_dims = x.dims(); - out->set_dims(in_dims); + out->share_meta(x); } void BCELossInferMeta(const MetaTensor& input, diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h index 02750482dccaabd53f360fcc361bfdc8e788b89e..a0140c9a5799f79af541b45847d5e44f982a3f58 100644 --- a/paddle/phi/infermeta/binary.h +++ b/paddle/phi/infermeta/binary.h @@ -53,6 +53,11 @@ void HuberLossInferMeta(const MetaTensor& input_meta, MetaTensor* residual, MetaConfig config = MetaConfig()); +void IndexSampleInferMeta(const MetaTensor& x, + const MetaTensor& y, + MetaTensor* out, + MetaConfig config = MetaConfig()); + void CrossInferMeta(const MetaTensor& x, const MetaTensor& y, int axis, diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 7a0db3d5c17ee3cd40891601009a3841f603bb32..7634e5e01aca4cdaf7fb46399f9594897f2d0e36 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -84,7 +84,7 @@ void BilinearTensorProductInferMeta(const MetaTensor& x, out->set_dtype(x.dtype()); } -void ConcatInferMeta(const std::vector& x, +void ConcatInferMeta(const std::vector& x, const Scalar& axis_scalar, MetaTensor* out, MetaConfig config) { @@ -93,10 +93,19 @@ void ConcatInferMeta(const std::vector& x, phi::errors::InvalidArgument( "The size of input meta vector should be greater" "than 0.")); + if (axis_scalar.FromTensor()) { + auto out_dims = + phi::make_ddim(std::vector(x.at(0)->dims().size(), -1)); + out->set_dims(out_dims); + out->set_dtype(x.at(0)->dtype()); + out->set_layout(x.at(0)->layout()); + out->share_lod(*x.at(0)); + return; + } int axis = axis_scalar.to(); // 1. 
calculate axis - int rank = x.at(0).dims().size(); + int rank = x.at(0)->dims().size(); PADDLE_ENFORCE_EQ( axis >= -rank && axis < rank, true, @@ -111,15 +120,42 @@ void ConcatInferMeta(const std::vector& x, // 2. calculate out dims std::vector x_dims; - for (auto& x_t : x) { - x_dims.push_back(x_t.dims()); + x_dims.reserve(x.size()); + for (const auto* x_t : x) { + x_dims.emplace_back(x_t->dims()); } phi::DDim out_dim = phi::funcs::ComputeAndCheckShape(config.is_runtime, x_dims, axis); out->set_dims(out_dim); - out->set_dtype(x.at(0).dtype()); - out->set_layout(x.at(0).layout()); + out->set_dtype(x.at(0)->dtype()); + out->set_layout(x.at(0)->layout()); + out->share_lod(*x.at(0)); +} + +void WhereInferMeta(const MetaTensor& condition, + const MetaTensor& x, + const MetaTensor& y, + MetaTensor* out) { + auto cond_dims = condition.dims(); + auto x_dims = x.dims(); + auto y_dims = y.dims(); + PADDLE_ENFORCE_EQ( + cond_dims, + x_dims, + phi::errors::InvalidArgument( + "The dims of Inputs(Condition) and Inputs(X) should be same. " + "But received Condition's shape is [%s], X's shape is [%s]", + cond_dims, + x_dims)); + PADDLE_ENFORCE_EQ(x_dims, + y_dims, + phi::errors::InvalidArgument( + "The dims of Inputs(X) and Inputs(Y) should be same. " + "But received X's shape is [%s], Y's shape is [%s]", + x_dims, + y_dims)); + out->share_meta(x); } } // namespace phi diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index a5fb2a4cbddc33b97b31a26fa29293868808875a..2afb79daa355cc897e3bf4076003e9a41de8b96c 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -25,9 +25,13 @@ void BilinearTensorProductInferMeta(const MetaTensor& x, MetaTensor* out, MetaConfig config = MetaConfig()); -void ConcatInferMeta(const std::vector& x, +void ConcatInferMeta(const std::vector& x, const Scalar& axis_scalar, MetaTensor* out, MetaConfig config = MetaConfig()); +void WhereInferMeta(const MetaTensor& condition, + const MetaTensor& x, + const MetaTensor& y, + MetaTensor* out); } // namespace phi diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 49fd0a343a470f2545fc563366256f4f92294297..4696187bd2382a9d81400a0fd088f9d0013ff506 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -459,8 +459,19 @@ void TransferLayoutInferMeta(const MetaTensor& x, void SplitInferMeta(const MetaTensor& x, const ScalarArray& num_or_sections, const Scalar& axis, - std::vector* out, + std::vector out, MetaConfig config) { + if (!config.is_runtime) { + if (axis.FromTensor() || num_or_sections.FromTensor()) { + auto out_dims = phi::make_ddim(std::vector(x.dims().size(), -1)); + for (auto* item : out) { + item->set_dims(out_dims); + item->share_lod(x); + } + return; + } + } + int axis_value = axis.to(); int rank = x.dims().size(); PADDLE_ENFORCE_EQ( @@ -475,27 +486,34 @@ void SplitInferMeta(const MetaTensor& x, axis_value = axis_value + rank; } + std::vector out_dims(out.size(), x.dims()); + auto input_axis_dim = x.dims().at(axis_value); auto num_or_sections_data = num_or_sections.GetData(); - // step1: get formated sections - std::vector sections; // num_or_sections is a number if (num_or_sections_data.size() == 1) { - int num = num_or_sections_data.at(0); + if (config.is_runtime || input_axis_dim > 0) { + int num = num_or_sections_data.at(0); + PADDLE_ENFORCE_EQ( + input_axis_dim % num, + 0, + phi::errors::InvalidArgument( + "The input's size along the split dimension " + "must be evenly divisible by Attr(num_or_sections). 
" + "But received Attr(num_or_sections) " + "= %d, input(X)'s shape = [%s], Attr(dim) = %d.", + num, + x.dims(), + axis_value)); - PADDLE_ENFORCE_EQ(input_axis_dim % num, - 0, - phi::errors::InvalidArgument( - "The input's size along the split dimension " - "must be evenly divisible by Attr(num_or_sections). " - "But received Attr(num_or_sections) " - "= %d, input(X)'s shape = [%s], Attr(dim) = %d.", - num, - x.dims(), - axis_value)); - - for (int i = 0; i < num; ++i) { - sections.push_back(input_axis_dim / num); + size_t out_axis_dim = input_axis_dim / num; + for (auto& out_dim : out_dims) { + out_dim[axis_value] = out_axis_dim; + } + } else { + for (auto& out_dim : out_dims) { + out_dim[axis_value] = -1; + } } } else { // num_or_sections is a sections @@ -503,10 +521,9 @@ void SplitInferMeta(const MetaTensor& x, int unknow_dim_idx = -1; int num_of_unknow = 0; int sum_of_section = 0; + std::vector sections = num_or_sections_data; for (size_t i = 0; i < num_or_sections_data.size(); ++i) { - sections.push_back(num_or_sections_data[i]); - if (num_or_sections_data[i] == unknow_dim_val) { num_of_unknow++; unknow_dim_idx = i; @@ -558,31 +575,22 @@ void SplitInferMeta(const MetaTensor& x, x.dims(), axis_value)); } - } - - // setp2: fill out dims - std::vector out_dims(sections.size(), x.dims()); - if (config.is_runtime || input_axis_dim > 0) { - for (size_t i = 0; i < sections.size(); ++i) { + for (size_t i = 0; i < out_dims.size(); ++i) { out_dims[i][axis_value] = sections[i]; } - } else { - for (size_t i = 0; i < sections.size(); ++i) { - out_dims[i][axis_value] = -1; - } } - for (size_t i = 0; i < sections.size(); ++i) { + for (size_t i = 0; i < out.size(); ++i) { if (axis_value != 0) { // Only pass LoD when not spliting along the first dim. - (*out)[i].set_dtype(x.dtype()); - (*out)[i].set_dims(out_dims[i]); - (*out)[i].set_layout(x.layout()); + out.at(i)->set_dtype(x.dtype()); + out.at(i)->set_dims(out_dims[i]); + out.at(i)->set_layout(x.layout()); } else { - (*out)[i].set_dtype(x.dtype()); - (*out)[i].set_dims(out_dims[i]); - (*out)[i].set_layout(x.layout()); - (*out)[i].share_lod(x); + out.at(i)->set_dtype(x.dtype()); + out.at(i)->set_dims(out_dims[i]); + out.at(i)->set_layout(x.layout()); + out.at(i)->share_lod(x); } } } diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index 4fab1ec68ec1e71af5e55a9852cd68deccc09a7c..b3929b9d2b47f87ab0f7b42ed74c2881c076f7d9 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -107,7 +107,7 @@ void TransferLayoutInferMeta(const MetaTensor& x, void SplitInferMeta(const MetaTensor& x_meta, const ScalarArray& num_or_sections, const Scalar& axis, - std::vector* out, + std::vector out, MetaConfig config = MetaConfig()); void UnbindInferMeta(const MetaTensor& x, diff --git a/paddle/phi/kernels/concat_kernel.h b/paddle/phi/kernels/concat_kernel.h index fbc4a86f5af611df3bd6b8f5101a3a2f26473c9d..f13667881468e15183c3d770df638641f1dc6ed0 100644 --- a/paddle/phi/kernels/concat_kernel.h +++ b/paddle/phi/kernels/concat_kernel.h @@ -31,13 +31,16 @@ DenseTensor Concat(const Context& dev_ctx, const std::vector& x, const Scalar& axis) { std::vector meta_x; + meta_x.reserve(x.size()); + std::vector meta_x_ptr; for (const auto& t : x) { meta_x.emplace_back(t); + meta_x_ptr.push_back(&meta_x.back()); } auto dense_out = phi::Empty(dev_ctx); MetaTensor meta_out(&dense_out); - ConcatInferMeta(meta_x, axis.to(), &meta_out, /*is_runtime=*/true); + ConcatInferMeta(meta_x_ptr, axis.to(), &meta_out, /*is_runtime=*/true); 
ConcatKernel(dev_ctx, x, axis, &dense_out); return dense_out; } diff --git a/paddle/phi/kernels/cpu/atan2_grad_kernel.cc b/paddle/phi/kernels/cpu/atan2_grad_kernel.cc index 6ff7431f0c8c556770b54e1328251e5996850fc9..7a519aab0ad71e4cd20270b216bf65262cab8ba6 100644 --- a/paddle/phi/kernels/cpu/atan2_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/atan2_grad_kernel.cc @@ -12,11 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/atan2_grad_kernel.h" +#include "paddle/phi/kernels/impl/atan2_grad_kernel_impl.h" + #include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/core/device_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/impl/atan2_grad_kernel_impl.h" PD_REGISTER_KERNEL(atan2_grad, CPU, diff --git a/paddle/phi/kernels/cpu/atan2_kernel.cc b/paddle/phi/kernels/cpu/atan2_kernel.cc index eb38a6c90b7938ef16cf9d56dfdb93903cc3c6a1..df6f5f59ac0056f36749faec8a300c1b5a1da1c9 100644 --- a/paddle/phi/kernels/cpu/atan2_kernel.cc +++ b/paddle/phi/kernels/cpu/atan2_kernel.cc @@ -12,11 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/atan2_kernel.h" +#include "paddle/phi/kernels/impl/atan2_kernel_impl.h" + #include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/core/device_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/impl/atan2_kernel_impl.h" PD_REGISTER_KERNEL(atan2, CPU, diff --git a/paddle/phi/kernels/cpu/concat_kernel.cc b/paddle/phi/kernels/cpu/concat_kernel.cc index 18bb8837b105d91e3e13a0a7519b08c9c47202c4..5c4202837c4487361f33b849df7d975e85f8490d 100644 --- a/paddle/phi/kernels/cpu/concat_kernel.cc +++ b/paddle/phi/kernels/cpu/concat_kernel.cc @@ -37,6 +37,7 @@ void ConcatKernel(const Context& dev_ctx, axis = phi::funcs::ComputeAxis(axis, x[0].dims().size()); std::vector x_dims; + x_dims.reserve(x.size()); for (size_t i = 0; i < x.size(); ++i) { x_dims.push_back(x[i].dims()); } @@ -97,9 +98,10 @@ void ConcatKernel(const Context& dev_ctx, } } else { std::vector inputs; + inputs.reserve(x.size()); for (size_t j = 0; j < x.size(); ++j) { if (x[j].numel() > 0) { - inputs.push_back(x[j]); + inputs.emplace_back(x[j]); } else { continue; } diff --git a/paddle/phi/kernels/cpu/index_sample_grad_kernel.cc b/paddle/phi/kernels/cpu/index_sample_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..006711ceef75edb7d9d3ed2530c0a5dda2b64993 --- /dev/null +++ b/paddle/phi/kernels/cpu/index_sample_grad_kernel.cc @@ -0,0 +1,106 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/index_sample_grad_kernel.h" +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/core/kernel_registry.h" +namespace phi { +template +void IndexSampleGradInner(const Context& context, + const DenseTensor& out_grad, + const DenseTensor& index, + DenseTensor* x_grad) { + std::vector out_grad_vec; + std::vector index_vec; + paddle::framework::TensorToVector(out_grad, context, &out_grad_vec); + paddle::framework::TensorToVector(index, context, &index_vec); + + auto index_dims = index.dims(); + auto x_grad_dims = x_grad->dims(); + + auto value_length = x_grad_dims[1]; + auto index_length = index_dims[1]; + int index_ids_num = index.numel(); + + std::vector x_grad_vec(x_grad->numel(), 0); + + for (int i = 0; i < index_ids_num; i++) { + int b = floor(i / index_length); + PADDLE_ENFORCE_GE( + index_vec[i], + 0, + errors::InvalidArgument( + "Variable value (index) of OP(index_sample_grad) " + "expected >= 0 and < %ld, but got %ld. Please check input " + "value.", + value_length, + index_vec[i])); + PADDLE_ENFORCE_LT( + index_vec[i], + value_length, + errors::InvalidArgument( + "Variable value (index) of OP(index_sample_grad) " + "expected >= 0 and < %ld, but got %ld. Please check input " + "value.", + value_length, + index_vec[i])); + int v_i = b * value_length + static_cast(index_vec[i]); + x_grad_vec[v_i] += out_grad_vec[i]; + } + context.template Alloc(x_grad); + paddle::framework::TensorFromVector(x_grad_vec, context, x_grad); + x_grad->Resize(x_grad_dims); +} + +template +void IndexSampleGradKernel(const Context& ctx, + const DenseTensor& out_grad, + const DenseTensor& x, + const DenseTensor& index, + DenseTensor* x_grad) { + auto index_type = index.dtype(); + bool index_type_match = + index_type == DataType::INT32 || index_type == DataType::INT64; + PADDLE_ENFORCE_EQ( + index_type_match, + true, + errors::InvalidArgument( + "Input(Index) holds the wrong type, it holds %s, but " + "desires to be %s or %s", + paddle::framework::DataTypeToString( + paddle::framework::TransToProtoVarType(index_type)), + paddle::framework::DataTypeToString( + paddle::framework::TransToProtoVarType(DataType::INT32)), + paddle::framework::DataTypeToString( + paddle::framework::TransToProtoVarType((DataType::INT64))))); + if (index_type == DataType::INT32) { + IndexSampleGradInner(ctx, out_grad, index, x_grad); + } else if (index_type == DataType::INT64) { + IndexSampleGradInner(ctx, out_grad, index, x_grad); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(index_sample_grad, + CPU, + ALL_LAYOUT, + phi::IndexSampleGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/index_sample_kernel.cc b/paddle/phi/kernels/cpu/index_sample_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..21bf9faee13cfa4da271a7d1b1a9fe482a55da04 --- /dev/null +++ b/paddle/phi/kernels/cpu/index_sample_kernel.cc @@ -0,0 +1,118 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/index_sample_kernel.h" +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/core/kernel_registry.h" +namespace phi { +template +void IndexSampleInner(const Context &context, + const DenseTensor &input, + const DenseTensor &index, + DenseTensor *output) { + auto input_dims = input.dims(); + auto index_dims = index.dims(); + + int batch_size = input_dims[0]; + auto value_length = input_dims[1]; + auto index_length = index_dims[1]; + int index_ids_num = index.numel(); + + std::vector input_vec; + std::vector index_vec; + paddle::framework::TensorToVector(input, context, &input_vec); + paddle::framework::TensorToVector(index, context, &index_vec); + + std::vector res(index_ids_num); + for (int i = 0; i < index_ids_num; i++) { + int b = floor(i / index_length); + PADDLE_ENFORCE_GE( + index_vec[i], + 0, + errors::InvalidArgument( + "Variable value (index) of OP(index_sample) " + "expected >= 0 and < %ld, but got %ld. Please check input " + "value.", + value_length, + index_vec[i])); + PADDLE_ENFORCE_LT( + index_vec[i], + value_length, + errors::InvalidArgument( + "Variable value (index) of OP(index_sample) " + "expected >= 0 and < %ld, but got %ld. 
Please check input " + "value.", + value_length, + index_vec[i])); + + int v_i = b * value_length + static_cast(index_vec[i]); + T v = input_vec[v_i]; + VLOG(4) << "Index Sample: batch = " << b << " index = " << v_i + << " value = " << v; + res[i] = v; + } + + auto ddim = phi::make_ddim({batch_size, index_length}); + context.template Alloc(output); + paddle::framework::TensorFromVector(res, context, output); + output->Resize(ddim); +} + +template +void IndexSampleKernel(const Context &ctx, + const DenseTensor &x, + const DenseTensor &index, + DenseTensor *out) { + ctx.template Alloc(out); + auto index_type = index.dtype(); + bool index_type_match = + index_type == DataType::INT32 || index_type == DataType::INT64; + PADDLE_ENFORCE_EQ( + index_type_match, + true, + errors::InvalidArgument( + "Input(Index) holds the wrong type, it holds %s, but " + "desires to be %s or %s", + paddle::framework::DataTypeToString( + paddle::framework::TransToProtoVarType(index_type)), + paddle::framework::DataTypeToString( + paddle::framework::TransToProtoVarType(DataType::INT32)), + paddle::framework::DataTypeToString( + paddle::framework::TransToProtoVarType((DataType::INT64))))); + if (index_type == DataType::INT32) { + IndexSampleInner(ctx, x, index, out); + } else if (index_type == DataType::INT64) { + IndexSampleInner(ctx, x, index, out); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(index_sample, + CPU, + ALL_LAYOUT, + phi::IndexSampleKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/logical_kernel.cc b/paddle/phi/kernels/cpu/logical_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..3d179e1e75f4fa98057f32737f09025ce1d6b2fb --- /dev/null +++ b/paddle/phi/kernels/cpu/logical_kernel.cc @@ -0,0 +1,72 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/logical_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/elementwise.h" +#include "paddle/phi/kernels/funcs/logical_functor.h" + +// See Note [ Why still include the fluid headers? 
] +#include "paddle/fluid/platform/transform.h" + +namespace phi { + +#define DEFINE_LOGICAL_BINARY_KERNEL(type) \ + template \ + void Logical##type##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& y, \ + DenseTensor* out) { \ + funcs::Logical##type##Functor binary_func; \ + ElementwiseCompute, T, bool>( \ + dev_ctx, x, y, -1, binary_func, out); \ + } + +DEFINE_LOGICAL_BINARY_KERNEL(And) +DEFINE_LOGICAL_BINARY_KERNEL(Or) +DEFINE_LOGICAL_BINARY_KERNEL(Xor) +#undef DEFINE_LOGICAL_BINARY_KERNEL + +template +void LogicalNotKernel(const Context& dev_ctx, + const DenseTensor& x, + DenseTensor* out) { + auto* out_ptr = dev_ctx.template Alloc(out); + funcs::LogicalNotFunctor unary_func; + + paddle::platform::Transform trans; + trans(dev_ctx, x.data(), x.data() + x.numel(), out_ptr, unary_func); +} + +} // namespace phi + +#define REGISTER_LOGICAL_CPU_KERNEL(logical_and, func_type) \ + PD_REGISTER_KERNEL(logical_and, \ + CPU, \ + ALL_LAYOUT, \ + phi::Logical##func_type##Kernel, \ + float, \ + double, \ + bool, \ + int64_t, \ + int, \ + int8_t, \ + int16_t) {} + +REGISTER_LOGICAL_CPU_KERNEL(logical_and, And) +REGISTER_LOGICAL_CPU_KERNEL(logical_or, Or) +REGISTER_LOGICAL_CPU_KERNEL(logical_not, Not) +REGISTER_LOGICAL_CPU_KERNEL(logical_xor, Xor) diff --git a/paddle/phi/kernels/cpu/split_kernel.cc b/paddle/phi/kernels/cpu/split_kernel.cc index 722681fb7bc3f9d9f75b92468b89931910dd532e..4acf9b02028f994c38144d716fdd56c6bbb6afa2 100644 --- a/paddle/phi/kernels/cpu/split_kernel.cc +++ b/paddle/phi/kernels/cpu/split_kernel.cc @@ -28,20 +28,6 @@ void SplitKernel(const Context& dev_ctx, const ScalarArray& num_or_sections, const Scalar& axis_scalar, std::vector outs) { - // need to infershape output - if (num_or_sections.FromTensor() || axis_scalar.FromTensor()) { - std::vector out_metas; - for (size_t i = 0; i < outs.size(); ++i) { - out_metas.push_back(outs[i]); - } - - phi::SplitInferMeta(x, num_or_sections, axis_scalar, &out_metas, true); - - for (size_t i = 0; i < out_metas.size(); ++i) { - outs[i]->Resize(out_metas[i].dims()); - } - } - std::vector shape_refer; for (size_t j = 0; j < outs.size(); ++j) { dev_ctx.template Alloc(outs[j]); diff --git a/paddle/phi/kernels/cpu/uniform_random_kernel.cc b/paddle/phi/kernels/cpu/uniform_random_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..8ec1d9683e15a92c7184d91005f85258cf1dd004 --- /dev/null +++ b/paddle/phi/kernels/cpu/uniform_random_kernel.cc @@ -0,0 +1,115 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/uniform_random_kernel.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +inline void UniformRealDistribution(T *data, + const int64_t &size, + const float &min, + const float &max, + std::shared_ptr engine) { + std::uniform_real_distribution dist(static_cast(min), + static_cast(max)); + for (int64_t i = 0; i < size; ++i) { + data[i] = dist(*engine); + } +} + +template <> +inline void UniformRealDistribution(phi::dtype::bfloat16 *data, + const int64_t &size, + const float &min, + const float &max, + std::shared_ptr engine) { + std::uniform_real_distribution dist(min, max); + for (int64_t i = 0; i < size; ++i) { + data[i] = static_cast(dist(*engine)); + } +} + +template +void UniformRandomRawKernel(const Context &dev_ctx, + const ScalarArray &shape, + DataType dtype, + float min, + float max, + int seed, + int diag_num, + int diag_step, + float diag_val, + DenseTensor *out) { + out->Resize(phi::make_ddim(shape.GetData())); + VLOG(4) << out->dims(); + T *data = dev_ctx.template Alloc(out); + auto size = out->numel(); + std::shared_ptr engine; + if (seed) { + engine = std::make_shared(); + engine->seed(seed); + } else { + engine = dev_ctx.GetGenerator()->GetCPUEngine(); + } + UniformRealDistribution(data, size, min, max, engine); + if (diag_num > 0) { + PADDLE_ENFORCE_GT( + size, + (diag_num - 1) * (diag_step + 1), + phi::errors::InvalidArgument( + "ShapeInvalid: the diagonal's elements is equal (num-1) " + "* (step-1) with num %d, step %d," + "It should be smaller than %d, but received %d", + diag_num, + diag_step, + (diag_num - 1) * (diag_step + 1), + size)); + for (int64_t i = 0; i < diag_num; ++i) { + int64_t pos = i * diag_step + i; + data[pos] = diag_val; + } + } +} + +template +void UniformRandomKernel(const Context &dev_ctx, + const ScalarArray &shape, + DataType dtype, + float min, + float max, + int seed, + DenseTensor *out) { + UniformRandomRawKernel( + dev_ctx, shape, dtype, min, max, seed, 0, 0, 0.0f, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL(uniform_random_raw, + CPU, + ALL_LAYOUT, + phi::UniformRandomRawKernel, + float, + double, + phi::dtype::bfloat16) {} + +PD_REGISTER_KERNEL(uniform_random, + CPU, + ALL_LAYOUT, + phi::UniformRandomKernel, + float, + double, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/cpu/where_grad_kernel.cc b/paddle/phi/kernels/cpu/where_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..67c8cee1038c7a990e5961a3fcd17e8d7c591207 --- /dev/null +++ b/paddle/phi/kernels/cpu/where_grad_kernel.cc @@ -0,0 +1,54 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/where_grad_kernel.h" + +namespace phi { + +template +void WhereGradKernel(const Context& ctx, + const DenseTensor& condition, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out_grad, + DenseTensor* x_grad, + DenseTensor* y_grad) { + const auto* cond_data = condition.data(); + auto numel = condition.numel(); + auto* dout = out_grad.data(); + + if (x_grad != nullptr) { + auto* dx = ctx.template Alloc(x_grad); + for (int i = 0; i < numel; i++) { + dx[i] = dout[i] * (cond_data[i] ? 1. : 0.); + } + } + if (y_grad != nullptr) { + auto* dy = ctx.template Alloc(y_grad); + for (int i = 0; i < numel; i++) { + dy[i] = dout[i] * (cond_data[i] ? 0. : 1.); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(where_grad, + CPU, + ALL_LAYOUT, + phi::WhereGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/where_kernel.cc b/paddle/phi/kernels/cpu/where_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..f624c13c262296964cef6b98f7d5d26dfc0b7d56 --- /dev/null +++ b/paddle/phi/kernels/cpu/where_kernel.cc @@ -0,0 +1,40 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/where_kernel.h" + +namespace phi { + +template +void WhereKernel(const Context& ctx, + const DenseTensor& condition, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + const bool* cond_data = condition.data(); + const T* x_data = x.data(); + const T* y_data = y.data(); + auto x_numel = x.numel(); + + T* out_data = ctx.template Alloc(out); + + for (int i = 0; i < x_numel; i++) { + out_data[i] = cond_data[i] ? x_data[i] : y_data[i]; + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + where, CPU, ALL_LAYOUT, phi::WhereKernel, float, double, int, int64_t) {} diff --git a/paddle/phi/kernels/funcs/aligned_vector.h b/paddle/phi/kernels/funcs/aligned_vector.h new file mode 100644 index 0000000000000000000000000000000000000000..9382b03cf9368cc726235a753a1990baacb60d52 --- /dev/null +++ b/paddle/phi/kernels/funcs/aligned_vector.h @@ -0,0 +1,75 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.1 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.1 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/core/hostdevice.h" + +namespace phi { + +// Aligned vector generates vectorized load/store on CUDA. 
+template +struct alignas(sizeof(T) * Size) AlignedVector { + T val[Size]; + + HOSTDEVICE inline const T& operator[](int i) const { return val[i]; } + HOSTDEVICE inline T& operator[](int i) { return val[i]; } +}; + +template +HOSTDEVICE inline void Load(const T* addr, AlignedVector* vec) { + const AlignedVector* addr_vec = + reinterpret_cast*>(addr); + *vec = *addr_vec; +} + +template +HOSTDEVICE inline void Store(const AlignedVector& vec, T* addr) { + AlignedVector* addr_vec = + reinterpret_cast*>(addr); + *addr_vec = vec; +} + +/* +* Only the address of input data is the multiplier of 1,2,4, vectorized load +* with corresponding multiplier-value is possible. Moreover, the maximum length +* of vectorized load is 128 bits once. Hence, valid length of vectorized load +* shall be determined under both former constraints. +*/ +template +int GetVectorizedSize(const T* pointer) { + constexpr int max_load_bits = 128; + int valid_vec_size = max_load_bits / CHAR_BIT / sizeof(T); + uint64_t address = reinterpret_cast(pointer); + constexpr int vec8 = std::alignment_of>::value; // NOLINT + constexpr int vec4 = std::alignment_of>::value; // NOLINT + constexpr int vec2 = std::alignment_of>::value; // NOLINT + if (address % vec8 == 0) { + /* + * Currently, decide to deal with no more than 4 data once while adopting + * vectorization load/store, if performance test shows that dealing with + * 8 data once in vectorization load/store does get optimized, return code + * below can be changed into " return std::min(8, valid_vec_size); " . + */ + return std::min(4, valid_vec_size); + } else if (address % vec4 == 0) { + return std::min(4, valid_vec_size); + } else if (address % vec2 == 0) { + return std::min(2, valid_vec_size); + } else { + return 1; + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/funcs/distribution_helper.h b/paddle/phi/kernels/funcs/distribution_helper.h new file mode 100644 index 0000000000000000000000000000000000000000..49e1c82482c0f14a665380e1b55e8f7bd67b1e30 --- /dev/null +++ b/paddle/phi/kernels/funcs/distribution_helper.h @@ -0,0 +1,249 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#ifdef __NVCC__ +#include +#endif +#ifdef __HIPCC__ +#include +#endif + +#include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/device_context.h" +#include "paddle/phi/core/generator.h" + +#include "paddle/phi/kernels/funcs/index_impl.cu.h" + +#if defined(__NVCC__) || defined(__HIPCC__) +#include "paddle/phi/kernels/primitive/kernel_primitives.h" +#endif + +#if !defined(_WIN32) +#define UNLIKELY(condition) __builtin_expect(static_cast(condition), 0) +#else +// there is no equivalent intrinsics in msvc. 
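// Illustrative sketch of the alignment probe behind GetVectorizedSize above: a
// pointer only qualifies for a wider vectorized load when its address is a
// multiple of the corresponding AlignedVector's alignment, capped by one
// 128-bit load. Host-only standalone analog (Example* names are hypothetical;
// the vec8 branch of the real function is folded into the 4-wide case here).
#include <climits>
#include <cstdint>
#include <iostream>

template <typename T, int Size>
struct alignas(sizeof(T) * Size) ExampleAlignedVector {
  T val[Size];
};

template <typename T>
int ExampleGetVectorizedSize(const T* pointer) {
  constexpr int kMaxLoadBits = 128;  // at most one 128-bit load per access
  const int valid_vec_size = static_cast<int>(kMaxLoadBits / CHAR_BIT / sizeof(T));
  const uint64_t address = reinterpret_cast<uint64_t>(pointer);
  constexpr int vec4 = alignof(ExampleAlignedVector<T, 4>);
  constexpr int vec2 = alignof(ExampleAlignedVector<T, 2>);
  if (address % vec4 == 0) return valid_vec_size < 4 ? valid_vec_size : 4;
  if (address % vec2 == 0) return valid_vec_size < 2 ? valid_vec_size : 2;
  return 1;
}

int main() {
  alignas(16) float buffer[8] = {0};
  std::cout << ExampleGetVectorizedSize(buffer) << ' '        // 4 (16B aligned)
            << ExampleGetVectorizedSize(buffer + 1) << '\n';  // 1 (4B offset)
  return 0;
}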
+#define UNLIKELY(condition) (condition) +#endif + +namespace phi { +namespace distribution { + +/********************* Transformation Function **********************/ +template +struct exponential_transform { + explicit exponential_transform(T lambda) : lambda_(lambda) {} + + HOSTDEVICE inline T operator()(T val) const { +#if defined(__NVCC__) || defined(__HIPCC__) + if (std::is_same::value) { + return static_cast(-1.0) / lambda_ * log(val); + } else { + return static_cast(-1.0) / lambda_ * __logf(val); + } +#else + return static_cast(-1.0) / lambda_ * std::log(static_cast(1.0) - val); +#endif + } + + private: + T lambda_; +}; + +template +struct uniform_transform { + explicit uniform_transform(T min, T max) : range_(max - min), min_(min) {} + + HOSTDEVICE inline T operator()(T val) const { + if (UNLIKELY(val == static_cast(1.0))) { + return min_; + } else { + return val * range_ + min_; + } + } + + private: + T range_; + T min_; +}; + +template +struct normal_transform { + explicit normal_transform(T mean, T std) : mean_(mean), std_(std) {} + + HOSTDEVICE inline T operator()(T val) const { return val * std_ + mean_; } + + private: + T mean_; + T std_; +}; + +#if defined(__NVCC__) || defined(__HIPCC__) + +namespace kps = phi::kps; + +/*********************** Distribution Function *************************/ +template +struct uniform_distribution; + +template +struct normal_distribution; + +#if defined(__NVCC__) +template <> +struct uniform_distribution { + __device__ inline float4 operator()(curandStatePhilox4_32_10_t *state) const { + return curand_uniform4(state); + } + static constexpr int kReturnsCount = 4; +}; + +template <> +struct uniform_distribution { + __device__ inline double2 operator()( + curandStatePhilox4_32_10_t *state) const { + return curand_uniform2_double(state); + } + static constexpr int kReturnsCount = 2; +}; + +template <> +struct normal_distribution { + __device__ inline float4 operator()(curandStatePhilox4_32_10_t *state) const { + return curand_normal4(state); + } + static constexpr int kReturnsCount = 4; +}; + +template <> +struct normal_distribution { + __device__ inline double2 operator()( + curandStatePhilox4_32_10_t *state) const { + return curand_normal2_double(state); + } + static constexpr int kReturnsCount = 2; +}; + +#else +template <> +struct uniform_distribution { + __device__ inline float4 operator()( + hiprandStatePhilox4_32_10_t *state) const { + return hiprand_uniform4(state); + } + static constexpr int kReturnsCount = 4; +}; + +template <> +struct uniform_distribution { + __device__ inline double2 operator()( + hiprandStatePhilox4_32_10_t *state) const { + return hiprand_uniform2_double(state); + } + static constexpr int kReturnsCount = 2; +}; + +template <> +struct normal_distribution { + __device__ inline float4 operator()( + hiprandStatePhilox4_32_10_t *state) const { + return hiprand_normal4(state); + } + static constexpr int kReturnsCount = 4; +}; + +template <> +struct normal_distribution { + __device__ inline double2 operator()( + hiprandStatePhilox4_32_10_t *state) const { + return hiprand_normal2_double(state); + } + static constexpr int kReturnsCount = 2; +}; +#endif + +/******** Launch GPU function of distribution and transformation *********/ +template +__global__ void DistributionKernel(size_t size, + uint64_t seed, + uint64_t offset, + DistOp dist, + TransformOp trans, + T *out_data, + size_t stride) { + size_t idx = static_cast(BLOCK_ID_X * BLOCK_NUM_X); + static constexpr int kCount = DistOp::kReturnsCount; +#if defined(__NVCC__) 
+  curandStatePhilox4_32_10_t state;
+  curand_init(seed, idx + THREAD_ID_X, offset, &state);
+  using SType = curandStatePhilox4_32_10_t;
+#else
+  hiprandStatePhilox4_32_10_t state;
+  hiprand_init(seed, idx + THREAD_ID_X, offset, &state);
+  using SType = hiprandStatePhilox4_32_10_t;
+#endif
+  size_t total_thread = GRID_NUM_X * BLOCK_NUM_X;
+  T args[kCount];
+  T result[kCount];
+  for (size_t i = idx; i < size; i += total_thread * kCount) {
+    kps::ElementwiseRandom<SType, T, kCount, 1, DistOp>(&args[0], dist, &state);
+    kps::ElementwiseUnary<T, T, kCount, 1, 1, TransformOp>(
+        &result[0], &args[0], trans);
+    kps::WriteData<T, T, kCount, 1, 1, true>(
+        out_data + i, &result[0], size - i, 1, stride, 1);
+    __syncthreads();
+  }
+}
+
+template <typename T, typename DistOp, typename TransformOp>
+void distribution_and_transform(const GPUContext &dev_ctx,
+                                DenseTensor *out,
+                                DistOp dist,
+                                TransformOp trans) {
+  T *out_data = dev_ctx.template Alloc<T>(out);
+  auto size = out->numel();
+
+  int64_t device_id = dev_ctx.GetPlace().GetDeviceId();
+  auto gen_cuda = dev_ctx.GetGenerator();
+
+  size_t block_size = 256;
+  size_t expect_grid_size = (size + block_size - 1) / block_size;
+  const auto &prop = backends::gpu::GetDeviceProperties(device_id);
+  size_t max_grid_size = (prop.maxThreadsPerMultiProcessor / block_size) *
+                         prop.multiProcessorCount;
+  size_t grid_size =
+      expect_grid_size > max_grid_size ? max_grid_size : expect_grid_size;
+
+  size_t total_thread = block_size * grid_size;
+  size_t curand4_loop_times =
+      (size + 4 * total_thread - 1) / (4 * total_thread);
+  // 'increment' should be a multiple of 4
+  uint64_t increment = curand4_loop_times * 4;
+
+  auto seed_offset = gen_cuda->IncrementOffset(increment);
+  uint64_t seed = seed_offset.first;
+  uint64_t offset = seed_offset.second;
+
+  DistributionKernel<
+      T,
+      DistOp,
+      TransformOp><<<grid_size, block_size, 0, dev_ctx.stream()>>>(
+      size, seed, offset, dist, trans, out_data, total_thread);
+}
+
+#endif
+}  // namespace distribution
+}  // namespace phi
diff --git a/paddle/phi/kernels/funcs/index_impl.cu.h b/paddle/phi/kernels/funcs/index_impl.cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..ccb70fe25ddce3ec9fba984a86049213ac51e5fa
--- /dev/null
+++ b/paddle/phi/kernels/funcs/index_impl.cu.h
@@ -0,0 +1,93 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#pragma once + +#include +#include +#include + +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/kernels/funcs/aligned_vector.h" +#include "paddle/phi/kernels/primitive/kernel_primitives.h" + +namespace phi { + +template +__global__ void VectorizedIndexKernel(T *out, + size_t numel, + size_t main_offset, + Functor func) { + size_t data_offset = BLOCK_ID_X * BLOCK_NUM_X * VecSize; + size_t stride = BLOCK_NUM_X * GRID_NUM_X * VecSize; + size_t args[VecSize]; + T result[VecSize]; + for (; data_offset < main_offset; data_offset += stride) { + kps::InitWithDataIndex(&args[0], data_offset); + kps::ElementwiseUnary( + &result[0], &args[0], func); + kps::WriteData( + out + data_offset, &result[0], BLOCK_NUM_X * VecSize); + } + size_t num = numel - data_offset; + if (num > 0) { + kps::InitWithDataIndex(&args[0], data_offset); + kps::ElementwiseUnary( + &result[0], &args[0], func); + kps::WriteData(out + data_offset, &result[0], num); + } +} + +template +void IndexKernel(const KPDevice &dev_ctx, DenseTensor *out, Functor func) { + int numel = out->numel(); + T *out_data = dev_ctx.template Alloc(out); + if (numel <= 0) return; + int vec_size = phi::GetVectorizedSize(out_data); +#ifdef PADDLE_WITH_XPU_KP + int block = 64; + int grid = 8; + auto stream = dev_ctx.x_context()->xpu_stream; +#else + auto config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel, vec_size); + int grid = config.block_per_grid.x; + int block = config.thread_per_block.x; + auto stream = dev_ctx.stream(); +#endif + size_t main_offset = (numel / (vec_size * block)) * vec_size * block; + switch (vec_size) { + case 4: + VectorizedIndexKernel<<>>( + out_data, numel, main_offset, func); + break; + case 2: + VectorizedIndexKernel<<>>( + out_data, numel, main_offset, func); + break; + case 1: + VectorizedIndexKernel<<>>( + out_data, numel, main_offset, func); + break; + default: { + PADDLE_THROW(phi::errors::Unimplemented( + "Unsupported vectorized size: %d !", vec_size)); + break; + } + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/funcs/logical_functor.h b/paddle/phi/kernels/funcs/logical_functor.h new file mode 100644 index 0000000000000000000000000000000000000000..1ea7fc43e6b32c85e446044011b0c2ab3c79817c --- /dev/null +++ b/paddle/phi/kernels/funcs/logical_functor.h @@ -0,0 +1,41 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
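An aside on the IndexKernel helper defined in index_impl.cu.h above: it fills an output tensor by applying a functor to every flattened element index, choosing a vectorization width (4/2/1) from the output pointer's alignment and handling the non-divisible tail with the boundary path. As a rough mental model of the functor contract, here is a host-side sketch; the functor and helper names are made up for illustration and are not part of phi:

#include <cstddef>
#include <vector>

// Hypothetical functor: maps a flat element index to a value, mirroring the
// per-element contract of the "Functor func" argument of IndexKernel.
struct IotaTimesTwo {
  float operator()(size_t idx) const { return 2.0f * static_cast<float>(idx); }
};

// Serial reference of what the vectorized device kernel computes: out[i] = func(i).
template <typename T, typename Functor>
void IndexKernelReference(std::vector<T>* out, Functor func) {
  for (size_t i = 0; i < out->size(); ++i) {
    (*out)[i] = func(i);
  }
}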
+ +#pragma once + +namespace phi { +namespace funcs { + +#define LOGICAL_BINARY_FUNCTOR(func_name, op) \ + template \ + struct func_name { \ + using ELEMENT_TYPE = T; \ + HOSTDEVICE bool operator()(const T a, const T b) const { \ + return static_cast(a) op static_cast(b); \ + } \ + }; + +LOGICAL_BINARY_FUNCTOR(LogicalOrFunctor, ||) +LOGICAL_BINARY_FUNCTOR(LogicalAndFunctor, &&) +LOGICAL_BINARY_FUNCTOR(LogicalXorFunctor, ^) +#undef LOGICAL_BINARY_FUNCTOR + +template +struct LogicalNotFunctor { + using ELEMENT_TYPE = T; + HOSTDEVICE bool operator()(const T a) const { return !a; } +}; + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/gpu/atan2_grad_kernel.cu b/paddle/phi/kernels/gpu/atan2_grad_kernel.cu index 1cc3311c3639820ef9b6d3a29d9274ac93bb5963..6652d242de5ce44f3bf64d91e6fae16c648c2726 100644 --- a/paddle/phi/kernels/gpu/atan2_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/atan2_grad_kernel.cu @@ -12,11 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/phi/kernels/impl/atan2_grad_kernel_impl.h" + #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/core/device_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/atan2_grad_kernel.h" -#include "paddle/phi/kernels/impl/atan2_grad_kernel_impl.h" PD_REGISTER_KERNEL(atan2_grad, GPU, diff --git a/paddle/phi/kernels/gpu/atan2_kernel.cu b/paddle/phi/kernels/gpu/atan2_kernel.cu index 702c959b78f75d0e52511d9bdc9d4330c6838aa4..dd0bba177defef7cdbd41ef7944110d126ca2d7c 100644 --- a/paddle/phi/kernels/gpu/atan2_kernel.cu +++ b/paddle/phi/kernels/gpu/atan2_kernel.cu @@ -12,11 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/phi/kernels/impl/atan2_kernel_impl.h" + #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/core/device_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/atan2_kernel.h" -#include "paddle/phi/kernels/impl/atan2_kernel_impl.h" PD_REGISTER_KERNEL(atan2, GPU, diff --git a/paddle/phi/kernels/gpu/cast_kernel.cu b/paddle/phi/kernels/gpu/cast_kernel.cu index 7a6c99c5fe15f6ddecd190d2d77e359503be7a80..569a46f56d5638584262c0d1c8002459fa8ffd70 100644 --- a/paddle/phi/kernels/gpu/cast_kernel.cu +++ b/paddle/phi/kernels/gpu/cast_kernel.cu @@ -80,8 +80,4 @@ void CastKernel(const Context& dev_ctx, paddle::experimental::DataType::UNDEFINED); \ } -#if !defined(PADDLE_WITH_HIP) PTEN_REGISTER_CAST_CUDA_BASE_TYPE(cast, phi::dtype::bfloat16) -#else -PTEN_REGISTER_CAST_CUDA_BASE_TYPE(cast) -#endif diff --git a/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu b/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..8b1ef964124d7d61004ba4cb9f3c53f7c5cec347 --- /dev/null +++ b/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu @@ -0,0 +1,146 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/index_sample_grad_kernel.h" + +#include +#include +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +namespace { +template +void LimitGridDim(const Context& ctx, dim3* grid_dim) { + auto max_grid_dim = + reinterpret_cast(ctx).GetCUDAMaxGridDimSize(); + grid_dim->x = grid_dim->x < max_grid_dim[0] ? grid_dim->x : max_grid_dim[0]; + grid_dim->y = grid_dim->y < max_grid_dim[1] ? grid_dim->y : max_grid_dim[1]; +} +#define PREDEFINED_BLOCK_SIZE_X 512 +#define PREDEFINED_BLOCK_SIZE 1024 +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +}; + +template +__global__ void IndexSampleGrad(const IndexT* index, + T* in_grad, + const T* out_grad, + size_t index_length, + size_t input_length, + size_t batch_size, + bool same_data_in_row = true) { + unsigned int index_i = blockDim.x * blockIdx.x + threadIdx.x; + unsigned int index_j = blockDim.y * blockIdx.y + threadIdx.y; + + for (; index_j < batch_size; index_j += blockDim.y * gridDim.y) { + index_i = blockDim.x * blockIdx.x + threadIdx.x; + for (; index_i < index_length; index_i += blockDim.x * gridDim.x) { + unsigned int index_idx = index_j * index_length + index_i; + unsigned int in_idx = index_j * input_length + index_i; + IndexT sample_idx = index[index_idx]; + if (same_data_in_row) { + paddle::platform::CudaAtomicAdd( + &(in_grad[in_idx - index_i + sample_idx]), out_grad[sample_idx]); + } else { + in_grad[in_idx - index_i + sample_idx] = out_grad[index_idx]; + } + } + } +} + +template +void IndexSampleGradKernel(const Context& ctx, + const DenseTensor& out_grad, + const DenseTensor& x, + const DenseTensor& index, + DenseTensor* x_grad) { + const T* output_grad_data = out_grad.data(); + T* input_grad_data = ctx.template Alloc(x_grad); + auto index_type = index.dtype(); + bool index_type_match = + index_type == DataType::INT32 || index_type == DataType::INT64; + PADDLE_ENFORCE_EQ( + index_type_match, + true, + errors::InvalidArgument( + "Input(Index) holds the wrong type, it holds %s, but " + "desires to be %s or %s", + paddle::framework::DataTypeToString( + paddle::framework::TransToProtoVarType(index_type)), + paddle::framework::DataTypeToString( + paddle::framework::TransToProtoVarType(DataType::INT32)), + paddle::framework::DataTypeToString( + paddle::framework::TransToProtoVarType((DataType::INT64))))); + + auto stream = reinterpret_cast(ctx).stream(); + auto input_num = x.numel(); + auto input_dim = x.dims(); + auto index_dim = index.dims(); + size_t batch_size = index_dim[0]; + size_t input_length = input_dim[1]; + size_t index_length = index_dim[1]; + bool same_data_in_index_row = index_length == 1 ? 
false : true; + + auto block_width = paddle::platform::RoundToPowerOfTwo(index_length); + block_width = MIN(block_width, PREDEFINED_BLOCK_SIZE_X); + auto block_height = + paddle::platform::RoundToPowerOfTwo(index_length * batch_size) / + block_width; + block_height = MIN(block_height, PREDEFINED_BLOCK_SIZE / block_width); + dim3 block_dim(block_width, block_height); + dim3 grid_dim((index_length + block_dim.x - 1) / block_dim.x, + (batch_size + block_dim.y - 1) / block_dim.y); + LimitGridDim(ctx, &grid_dim); + + phi::funcs::SetConstant set_zero; + set_zero(ctx, x_grad, static_cast(0)); + + if (index_type == DataType::INT64) { + const int64_t* index_data = index.data(); + IndexSampleGrad<<>>( + index_data, + input_grad_data, + output_grad_data, + index_length, + input_length, + batch_size, + same_data_in_index_row); + } else if (index_type == DataType::INT32) { + const int* index_data = index.data(); + IndexSampleGrad<<>>( + index_data, + input_grad_data, + output_grad_data, + index_length, + input_length, + batch_size, + same_data_in_index_row); + } +} +} // namespace phi + +PD_REGISTER_KERNEL(index_sample_grad, + GPU, + ALL_LAYOUT, + phi::IndexSampleGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/index_sample_kernel.cu b/paddle/phi/kernels/gpu/index_sample_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..0e042089e1e3d0a20bf3811de3633f5fea0584fa --- /dev/null +++ b/paddle/phi/kernels/gpu/index_sample_kernel.cu @@ -0,0 +1,119 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/index_sample_kernel.h" + +#include +#include +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +namespace { +template +void LimitGridDim(const Context& ctx, dim3* grid_dim) { + auto max_grid_dim = + reinterpret_cast(ctx).GetCUDAMaxGridDimSize(); + grid_dim->x = grid_dim->x < max_grid_dim[0] ? grid_dim->x : max_grid_dim[0]; + grid_dim->y = grid_dim->y < max_grid_dim[1] ? grid_dim->y : max_grid_dim[1]; +} +#define PREDEFINED_BLOCK_SIZE_X 512 +#define PREDEFINED_BLOCK_SIZE 1024 +#define MIN(a, b) ((a) < (b) ? 
(a) : (b)) +} + +template +__global__ void IndexSampleForward(const IndexT* index, + const T* in_data, + T* out_data, + size_t index_length, + size_t input_length, + size_t batch_size) { + unsigned int index_i = blockDim.x * blockIdx.x + threadIdx.x; + unsigned int index_j = blockDim.y * blockIdx.y + threadIdx.y; + for (; index_j < batch_size; index_j += blockDim.y * gridDim.y) { + index_i = blockDim.x * blockIdx.x + threadIdx.x; + for (; index_i < index_length; index_i += blockDim.x * gridDim.x) { + unsigned int index_idx = index_j * index_length + index_i; + unsigned int in_idx = index_j * input_length + index_i; + IndexT sample_idx = index[index_idx]; + out_data[index_idx] = in_data[in_idx - index_i + sample_idx]; + } + } +} + +template +void IndexSampleKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& index, + DenseTensor* out) { + auto index_type = index.dtype(); + bool index_type_match = + index_type == DataType::INT32 || index_type == DataType::INT64; + PADDLE_ENFORCE_EQ( + index_type_match, + true, + errors::InvalidArgument( + "Input(Index) holds the wrong type, it holds %s, but " + "desires to be %s or %s", + paddle::framework::DataTypeToString( + paddle::framework::TransToProtoVarType(index_type)), + paddle::framework::DataTypeToString( + paddle::framework::TransToProtoVarType(DataType::INT32)), + paddle::framework::DataTypeToString( + paddle::framework::TransToProtoVarType((DataType::INT64))))); + const T* in_data = x.data(); + T* out_data = ctx.template Alloc(out); + auto stream = reinterpret_cast(ctx).stream(); + auto input_dim = x.dims(); + auto index_dim = index.dims(); + size_t batch_size = input_dim[0]; + size_t input_length = input_dim[1]; + size_t index_length = index_dim[1]; + + auto block_width = paddle::platform::RoundToPowerOfTwo(index_length); + block_width = MIN(block_width, PREDEFINED_BLOCK_SIZE_X); + int block_height = + paddle::platform::RoundToPowerOfTwo(index_length * batch_size) / + block_width; + block_height = MIN(block_height, PREDEFINED_BLOCK_SIZE / block_width); + dim3 block_dim(block_width, block_height); + dim3 grid_dim((index_length + block_dim.x - 1) / block_dim.x, + (batch_size + block_dim.y - 1) / block_dim.y); + LimitGridDim(ctx, &grid_dim); + + if (index_type == DataType::INT64) { + const int64_t* index_data = index.data(); + IndexSampleForward<<>>( + index_data, in_data, out_data, index_length, input_length, batch_size); + } else if (index_type == DataType::INT32) { + const int* index_data = index.data(); + IndexSampleForward<<>>( + index_data, in_data, out_data, index_length, input_length, batch_size); + } +} +} // namespace phi + +PD_REGISTER_KERNEL(index_sample, + GPU, + ALL_LAYOUT, + phi::IndexSampleKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/logical_kernel.cu b/paddle/phi/kernels/gpu/logical_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..f32d4c77d4059f4c6c0157fc839d3fa345ed489c --- /dev/null +++ b/paddle/phi/kernels/gpu/logical_kernel.cu @@ -0,0 +1,79 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/logical_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" +#include "paddle/phi/kernels/funcs/logical_functor.h" +#include "paddle/phi/kernels/gpu/elementwise.h" + +namespace phi { + +#define DEFINE_LOGICAL_BINARY_KERNEL(type) \ + template \ + void Logical##type##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& y, \ + DenseTensor* out) { \ + using InT = typename funcs::Logical##type##Functor::ELEMENT_TYPE; \ + using OutT = bool; \ + dev_ctx.template Alloc(out); \ + funcs::Logical##type##Functor binary_func; \ + std::vector ins = {&x, &y}; \ + std::vector outs = {out}; \ + funcs::BroadcastKernel( \ + dev_ctx, ins, &outs, -1, binary_func); \ + } + +DEFINE_LOGICAL_BINARY_KERNEL(And) +DEFINE_LOGICAL_BINARY_KERNEL(Or) +DEFINE_LOGICAL_BINARY_KERNEL(Xor) +#undef DEFINE_LOGICAL_BINARY_KERNEL + +template +void LogicalNotKernel(const Context& dev_ctx, + const DenseTensor& x, + DenseTensor* out) { + using InT = typename funcs::LogicalNotFunctor::ELEMENT_TYPE; + using OutT = bool; + + dev_ctx.template Alloc(out); + funcs::LogicalNotFunctor unary_func; + std::vector ins = {&x}; + std::vector outs = {out}; + funcs::BroadcastKernel( + dev_ctx, ins, &outs, -1, unary_func); +} + +} // namespace phi + +#define REGISTER_LOGICAL_CUDA_KERNEL(logical_and, func_type) \ + PD_REGISTER_KERNEL(logical_and, \ + GPU, \ + ALL_LAYOUT, \ + phi::Logical##func_type##Kernel, \ + float, \ + double, \ + bool, \ + int64_t, \ + int, \ + int8_t, \ + int16_t) {} + +REGISTER_LOGICAL_CUDA_KERNEL(logical_and, And) +REGISTER_LOGICAL_CUDA_KERNEL(logical_or, Or) +REGISTER_LOGICAL_CUDA_KERNEL(logical_not, Not) +REGISTER_LOGICAL_CUDA_KERNEL(logical_xor, Xor) diff --git a/paddle/phi/kernels/gpu/math_kernel.cu b/paddle/phi/kernels/gpu/math_kernel.cu index 56e8b16ccbe0df16fdc96470a8167e6dc6abfb3c..fc73ccca6de18ea169b60fc6e998d42a8cb03919 100644 --- a/paddle/phi/kernels/gpu/math_kernel.cu +++ b/paddle/phi/kernels/gpu/math_kernel.cu @@ -155,6 +155,7 @@ PD_REGISTER_KERNEL(sum_raw, float, double, float16, + bfloat16, int16_t, int, int64_t, diff --git a/paddle/phi/kernels/gpu/scale_kernel.cu b/paddle/phi/kernels/gpu/scale_kernel.cu index d9c8de21c5bc2d26cb371d03be30ed0616a27a64..930c50a24be8fae40535c2d5e6dbbe85e7ced990 100644 --- a/paddle/phi/kernels/gpu/scale_kernel.cu +++ b/paddle/phi/kernels/gpu/scale_kernel.cu @@ -70,6 +70,7 @@ PD_REGISTER_KERNEL(scale, float, double, phi::dtype::float16, + phi::dtype::bfloat16, uint8_t, int8_t, int16_t, diff --git a/paddle/phi/kernels/gpu/split_kernel.cu b/paddle/phi/kernels/gpu/split_kernel.cu index a698b9e716140b59b10a5799647e0a1aa7a8261d..d2473d5b0b110a122247c32c779b7a700c3249b1 100644 --- a/paddle/phi/kernels/gpu/split_kernel.cu +++ b/paddle/phi/kernels/gpu/split_kernel.cu @@ -27,20 +27,6 @@ void SplitKernel(const Context& dev_ctx, const ScalarArray& num_or_sections, const Scalar& axis_scalar, std::vector outs) { - // need to infershape output - if (num_or_sections.FromTensor() || axis_scalar.FromTensor()) { - 
std::vector out_metas; - for (size_t i = 0; i < outs.size(); ++i) { - out_metas.push_back(outs[i]); - } - - phi::SplitInferMeta(x, num_or_sections, axis_scalar, &out_metas, true); - - for (size_t i = 0; i < out_metas.size(); ++i) { - outs[i]->Resize(out_metas[i].dims()); - } - } - std::vector shape_refer; for (size_t j = 0; j < outs.size(); ++j) { dev_ctx.template Alloc(outs[j]); diff --git a/paddle/phi/kernels/gpu/uniform_random_kernel.cu b/paddle/phi/kernels/gpu/uniform_random_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..7f24a6667e562e64d8b523dd3ab1883af27bed5a --- /dev/null +++ b/paddle/phi/kernels/gpu/uniform_random_kernel.cu @@ -0,0 +1,163 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/uniform_random_kernel.h" + +#include "gflags/gflags.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/distribution_helper.h" +#include "paddle/phi/kernels/funcs/index_impl.cu.h" + +DECLARE_bool(use_curand); + +namespace phi { + +template +struct UniformGenerator { + T min_, max_; + unsigned int seed_; + T diag_val_; + unsigned int diag_num_; + unsigned int diag_step_; + __host__ __device__ UniformGenerator( + T min, T max, int seed, int diag_num, int diag_step, T diag_val) + : min_(min), + max_(max), + seed_(seed), + diag_num_(diag_num), + diag_step_(diag_step), + diag_val_(diag_val) {} + + __host__ __device__ T operator()(const unsigned int n) const { + thrust::minstd_rand rng; + rng.seed(seed_); + thrust::uniform_real_distribution dist(min_, max_); + rng.discard(n); + T out = dist(rng); + unsigned int remainder = n % (diag_step_ + 1); + if (remainder == 0 && diag_num_ > n / (diag_step_ + 1)) { + out = diag_val_; + } + return out; + } +}; + +template +struct UniformGeneratorOffset { + T min_, max_; + unsigned int seed_; + T diag_val_; + unsigned int diag_num_; + unsigned int diag_step_; + int offset_; + __host__ __device__ UniformGeneratorOffset(T min, + T max, + int seed, + int diag_num, + int diag_step, + T diag_val, + int offset) + : min_(min), + max_(max), + seed_(seed), + diag_num_(diag_num), + diag_step_(diag_step), + diag_val_(diag_val), + offset_(offset) {} + + __host__ __device__ T operator()(const unsigned int n) const { + thrust::minstd_rand rng; + rng.seed(seed_); + thrust::uniform_real_distribution dist(min_, max_); + rng.discard(n + offset_); + T out = dist(rng); + unsigned int remainder = n % (diag_step_ + 1); + if (remainder == 0 && diag_num_ > n / (diag_step_ + 1)) { + out = diag_val_; + } + return out; + } +}; + +template +void UniformRandomRawKernel(const Context& dev_ctx, + const ScalarArray& shape, + DataType dtype, + float min, + float max, + int seed, + int diag_num, + int diag_step, + float diag_val, + DenseTensor* out) { + out->Resize(phi::make_ddim(shape.GetData())); + T* data = dev_ctx.template Alloc(out); + auto size = out->numel(); + bool seed_flag = false; + if (seed == 0) { + std::random_device 
rd; + seed = rd(); + seed_flag = true; + } + + auto generator = dev_ctx.GetGenerator(); + if (generator->GetIsInitPy() && seed_flag) { + if (FLAGS_use_curand) { + using MT = typename kps::details::MPTypeTrait::Type; + distribution::uniform_distribution dist; + distribution::uniform_transform trans(min, max); + distribution::distribution_and_transform(dev_ctx, out, dist, trans); + } else { + auto seed_offset = generator->IncrementOffset(1); + int64_t gen_offset = size * seed_offset.second; + auto func = UniformGeneratorOffset(min, + max, + seed_offset.first, + diag_num, + diag_step, + diag_val, + gen_offset); + IndexKernel>(dev_ctx, out, func); + } + } else { + auto func = + UniformGenerator(min, max, seed, diag_num, diag_step, diag_val); + IndexKernel>(dev_ctx, out, func); + } +} + +template +void UniformRandomKernel(const Context& dev_ctx, + const ScalarArray& shape, + DataType dtype, + float min, + float max, + int seed, + DenseTensor* out) { + UniformRandomRawKernel( + dev_ctx, shape, dtype, min, max, seed, 0, 0, 0.0f, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL(uniform_random_raw, + GPU, + ALL_LAYOUT, + phi::UniformRandomRawKernel, + float, + double) {} + +PD_REGISTER_KERNEL( + uniform_random, GPU, ALL_LAYOUT, phi::UniformRandomKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/where_grad_kernel.cu b/paddle/phi/kernels/gpu/where_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..f21aca80e21b30de8931b4fcd4ae3922be959958 --- /dev/null +++ b/paddle/phi/kernels/gpu/where_grad_kernel.cu @@ -0,0 +1,64 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/where_grad_kernel.h" + +namespace phi { + +template +__global__ void WhereGradCUDAKernel( + const int N, const T* dout, const bool* cond, T* dx, T* dy) { + int idx = blockDim.x * blockIdx.x + threadIdx.x; + for (; idx < N; idx += blockDim.x * gridDim.x) { + if (dx != nullptr) { + dx[idx] = cond[idx] ? dout[idx] : 0.; + } + if (dy != nullptr) { + dy[idx] = cond[idx] ? 0. : dout[idx]; + } + } +} + +template +void WhereGradKernel(const Context& ctx, + const DenseTensor& condition, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out_grad, + DenseTensor* x_grad, + DenseTensor* y_grad) { + const bool* cond_data = condition.data(); + auto numel = condition.numel(); + auto* dout = out_grad.data(); + + T* dx = (x_grad != nullptr) ? ctx.template Alloc(x_grad) : nullptr; + T* dy = (y_grad != nullptr) ? 
ctx.template Alloc(y_grad) : nullptr; + + auto stream = ctx.stream(); + auto config = backends::gpu::GetGpuLaunchConfig1D(ctx, numel); + WhereGradCUDAKernel< + T><<>>( + numel, dout, cond_data, dx, dy); +} + +} // namespace phi + +PD_REGISTER_KERNEL(where_grad, + GPU, + ALL_LAYOUT, + phi::WhereGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/where_kernel.cu b/paddle/phi/kernels/gpu/where_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..03c24eea3a95af1ed57f5c8df42b01fd09af1fa2 --- /dev/null +++ b/paddle/phi/kernels/gpu/where_kernel.cu @@ -0,0 +1,48 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/where_kernel.h" + +#include "paddle/phi/kernels/funcs/broadcast_function.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" + +namespace phi { + +// Cond +template +struct CondFunctor { + inline HOSTDEVICE T operator()(const bool cond, const T x, const T y) const { + return cond ? x : y; + } +}; + +template +void WhereKernel(const Context& ctx, + const DenseTensor& condition, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + std::vector ins = {&condition, &x, &y}; + std::vector outs = {out}; + ctx.template Alloc(out); + + CondFunctor func; + funcs::BroadcastKernel( + ctx, ins, &outs, -1, func); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + where, GPU, ALL_LAYOUT, phi::WhereKernel, float, double, int, int64_t) {} diff --git a/paddle/phi/kernels/impl/atan2_grad_kernel_impl.h b/paddle/phi/kernels/impl/atan2_grad_kernel_impl.h index d0dd18298518ab351918aa2492eb48d11d3cf1d7..0eff1378f41de9b31a35375f86ca69a427d19f4f 100644 --- a/paddle/phi/kernels/impl/atan2_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/atan2_grad_kernel_impl.h @@ -14,9 +14,10 @@ #pragma once -#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/atan2_grad_kernel.h" -#include "paddle/phi/kernels/funcs/for_range.h" + +#include "paddle/fluid/platform/for_range.h" +#include "paddle/phi/core/dense_tensor.h" namespace phi { diff --git a/paddle/phi/kernels/impl/atan2_kernel_impl.h b/paddle/phi/kernels/impl/atan2_kernel_impl.h index 2cae914e2f61555377f7a41b3d89cdbb2b589247..7653032f2113c6e181673c57feaec2efd6472838 100644 --- a/paddle/phi/kernels/impl/atan2_kernel_impl.h +++ b/paddle/phi/kernels/impl/atan2_kernel_impl.h @@ -14,9 +14,10 @@ #pragma once -#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/atan2_kernel.h" -#include "paddle/phi/kernels/funcs/for_range.h" + +#include "paddle/fluid/platform/for_range.h" +#include "paddle/phi/core/dense_tensor.h" namespace phi { template diff --git a/paddle/phi/kernels/index_sample_grad_kernel.h b/paddle/phi/kernels/index_sample_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..5c6e101f1b43df04d58da25fd7252f0ff929386e --- /dev/null +++ b/paddle/phi/kernels/index_sample_grad_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 
2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void IndexSampleGradKernel(const Context& ctx, + const DenseTensor& out_grad, + const DenseTensor& x, + const DenseTensor& index, + DenseTensor* in_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/index_sample_kernel.h b/paddle/phi/kernels/index_sample_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..fb43c0c6c5f97c6d47381c72786c6e44441e7762 --- /dev/null +++ b/paddle/phi/kernels/index_sample_kernel.h @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void IndexSampleKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& index, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/logical_kernel.h b/paddle/phi/kernels/logical_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..3ccc03a5b598a0a939cde00d74e1f6126808f655 --- /dev/null +++ b/paddle/phi/kernels/logical_kernel.h @@ -0,0 +1,38 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +#define DECLEAR_LOGICAL_BINARY_KERNEL(type) \ + template \ + void Logical##type##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& y, \ + DenseTensor* out); + +DECLEAR_LOGICAL_BINARY_KERNEL(And) +DECLEAR_LOGICAL_BINARY_KERNEL(Or) +DECLEAR_LOGICAL_BINARY_KERNEL(Xor) +#undef DECLEAR_LOGICAL_BINARY_KERNEL + +template +void LogicalNotKernel(const Context& dev_ctx, + const DenseTensor& x, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/math_kernel.cc b/paddle/phi/kernels/math_kernel.cc index 3cb7b66ddf73e5fa3c5502a4acaad2c277a22ac6..480eb56c8b05c12c36337d4649a17b3b03146fdf 100644 --- a/paddle/phi/kernels/math_kernel.cc +++ b/paddle/phi/kernels/math_kernel.cc @@ -165,6 +165,7 @@ PD_REGISTER_KERNEL(sum, float, double, phi::dtype::float16, + phi::dtype::bfloat16, int16_t, int, int64_t, diff --git a/paddle/phi/kernels/selected_rows/uniform_random_kernel.cc b/paddle/phi/kernels/selected_rows/uniform_random_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..881180b71b151aa48a16dcd15871d4a7cd656fb7 --- /dev/null +++ b/paddle/phi/kernels/selected_rows/uniform_random_kernel.cc @@ -0,0 +1,88 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/kernels/uniform_random_kernel.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void UniformRandomRawSRKernel(const Context& dev_ctx, + const ScalarArray& shape, + DataType dtype, + float min, + float max, + int seed, + int diag_num, + int diag_step, + float diag_val, + SelectedRows* out) { + phi::UniformRandomRawKernel(dev_ctx, + shape, + dtype, + min, + max, + seed, + diag_num, + diag_step, + diag_val, + out->mutable_value()); +} + +template +void UniformRandomSRKernel(const Context& dev_ctx, + const ScalarArray& shape, + DataType dtype, + float min, + float max, + int seed, + SelectedRows* out) { + phi::UniformRandomKernel( + dev_ctx, shape, dtype, min, max, seed, out->mutable_value()); +} + +} // namespace phi + +PD_REGISTER_KERNEL(uniform_random_raw_sr, + CPU, + ALL_LAYOUT, + phi::UniformRandomRawSRKernel, + float, + double, + phi::dtype::bfloat16) {} + +PD_REGISTER_KERNEL(uniform_random_sr, + CPU, + ALL_LAYOUT, + phi::UniformRandomSRKernel, + float, + double, + phi::dtype::bfloat16) {} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + +PD_REGISTER_KERNEL(uniform_random_raw_sr, + GPU, + ALL_LAYOUT, + phi::UniformRandomRawSRKernel, + float, + double) {} + +PD_REGISTER_KERNEL(uniform_random_sr, + GPU, + ALL_LAYOUT, + phi::UniformRandomSRKernel, + float, + double) {} +#endif diff --git a/paddle/phi/kernels/split_kernel.h b/paddle/phi/kernels/split_kernel.h index 1e730d809bc3a225d8dc34d24bde48f857b7ca9a..840fe4366ce7eaca82608612dfb41cc7f7783f4c 100644 --- a/paddle/phi/kernels/split_kernel.h +++ b/paddle/phi/kernels/split_kernel.h @@ -43,18 +43,18 @@ std::vector Split(const Context& dev_ctx, } std::vector out_meta; + std::vector out_meta_ptr; out_meta.reserve(out_number); + out_meta_ptr.reserve(out_number); std::vector result; result.reserve(out_number); for (size_t i = 0; i < out_number; ++i) { - auto dense_out = phi::Empty(dev_ctx); - MetaTensor tmp_meta(&dense_out); - - result.push_back(dense_out); - out_meta.push_back(&result.back()); + result.emplace_back(phi::Empty(dev_ctx)); + out_meta.emplace_back(&result.back()); + out_meta_ptr.push_back(&out_meta.back()); } - SplitInferMeta(x, num_or_sections, axis, &out_meta); + SplitInferMeta(x, num_or_sections, axis, out_meta_ptr); std::vector outs; outs.reserve(out_meta.size()); diff --git a/paddle/phi/kernels/uniform_random_kernel.h b/paddle/phi/kernels/uniform_random_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..5bba127278541e61b142dbeb7d00f10ed3f8437b --- /dev/null +++ b/paddle/phi/kernels/uniform_random_kernel.h @@ -0,0 +1,66 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
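A note on the Split helper change in split_kernel.h a few hunks above: the MetaTensor objects now live in one vector while SplitInferMeta receives a parallel vector of raw pointers into it. The pattern is only safe because the vectors are reserve()d before the loop, so emplace_back never reallocates and the stored addresses stay valid. A minimal stand-alone sketch of that idiom, using generic placeholder types rather than phi's:

#include <vector>

struct Meta { int dims = 0; };

// Consumer that mutates metas through pointers, standing in for SplitInferMeta.
void InferMeta(const std::vector<Meta*>& metas) {
  for (Meta* m : metas) m->dims = 42;
}

std::vector<Meta> BuildAndInfer(size_t n) {
  std::vector<Meta> metas;
  std::vector<Meta*> meta_ptrs;
  metas.reserve(n);      // reserve first: emplace_back must not reallocate,
  meta_ptrs.reserve(n);  // otherwise the pointers taken below would dangle
  for (size_t i = 0; i < n; ++i) {
    metas.emplace_back();
    meta_ptrs.push_back(&metas.back());
  }
  InferMeta(meta_ptrs);
  return metas;
}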
+ +#pragma once + +#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/device_context.h" +#include "paddle/phi/core/selected_rows.h" + +namespace phi { + +template +void UniformRandomRawKernel(const Context& dev_ctx, + const ScalarArray& shape, + DataType dtype, + float min, + float max, + int seed, + int diag_num, + int diag_step, + float diag_val, + DenseTensor* out); + +template +void UniformRandomKernel(const Context& dev_ctx, + const ScalarArray& shape, + DataType dtype, + float min, + float max, + int seed, + DenseTensor* out); + +template +void UniformRandomRawSRKernel(const Context& dev_ctx, + const ScalarArray& shape, + DataType dtype, + float min, + float max, + int seed, + int diag_num, + int diag_step, + float diag_val, + SelectedRows* out); + +template +void UniformRandomSRKernel(const Context& dev_ctx, + const ScalarArray& shape, + DataType dtype, + float min, + float max, + int seed, + SelectedRows* out); + +} // namespace phi diff --git a/paddle/phi/kernels/where_grad_kernel.h b/paddle/phi/kernels/where_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..1a3c66ee6ed8403d0b453ed38d21e4beed02661c --- /dev/null +++ b/paddle/phi/kernels/where_grad_kernel.h @@ -0,0 +1,33 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/backends/all_context.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void WhereGradKernel(const Context& ctx, + const DenseTensor& condition, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out_grad, + DenseTensor* x_grad, + DenseTensor* y_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/where_kernel.h b/paddle/phi/kernels/where_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..254271ac9c7238c66d09ffe41d12e29fe8f23237 --- /dev/null +++ b/paddle/phi/kernels/where_kernel.h @@ -0,0 +1,31 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
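For reference, the where / where_grad pair declared in these headers is an elementwise select and its gradient; the CUDA kernels added earlier in this patch implement exactly this math per element. A scalar host-side sketch in plain C++, not the phi API:

// Forward: out = cond ? x : y
// Backward: dx = cond ? dout : 0,  dy = cond ? 0 : dout
template <typename T>
void WhereReference(const bool* cond, const T* x, const T* y, T* out, int n) {
  for (int i = 0; i < n; ++i) out[i] = cond[i] ? x[i] : y[i];
}

template <typename T>
void WhereGradReference(const bool* cond, const T* dout, T* dx, T* dy, int n) {
  for (int i = 0; i < n; ++i) {
    if (dx) dx[i] = cond[i] ? dout[i] : static_cast<T>(0);
    if (dy) dy[i] = cond[i] ? static_cast<T>(0) : dout[i];
  }
}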
+ +#pragma once + +#include "paddle/phi/backends/all_context.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void WhereKernel(const Context& ctx, + const DenseTensor& condition, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/xpu/full_kernel.cc b/paddle/phi/kernels/xpu/full_kernel.cc index 574f4e991a260e8ebc250fe3f8461736dc3eb7f8..d43126d56e88c868d4e273aaf13bd71bc570d37c 100644 --- a/paddle/phi/kernels/xpu/full_kernel.cc +++ b/paddle/phi/kernels/xpu/full_kernel.cc @@ -59,7 +59,7 @@ void FullKernel(const Context& dev_ctx, const Scalar& val, DataType dtype, DenseTensor* out) { - out->ResizeAndAllocate(phi::make_ddim(shape.GetData())); + out->Resize(phi::make_ddim(shape.GetData())); FullValueXPU(dev_ctx, out, val.to()); } @@ -69,6 +69,7 @@ void FullLikeKernel(const Context& dev_ctx, const Scalar& val, DataType dtype, DenseTensor* out) { + dev_ctx.template Alloc(out); auto value = val.to(); using XPUInTDType = typename XPUTypeTrait::Type; using CommonType = typename std::common_type< diff --git a/paddle/phi/ops/compat/index_sample_sig.cc b/paddle/phi/ops/compat/index_sample_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..0d2aed68a72a5e3aa762b4adbcf7c6e39869b927 --- /dev/null +++ b/paddle/phi/ops/compat/index_sample_sig.cc @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature IndexSampleGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("index_sample_grad", + {GradVarName("Out"), "X", "Index"}, + {}, + {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(index_sample_grad, + phi::IndexSampleGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/uniform_random_sig.cc b/paddle/phi/ops/compat/uniform_random_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..d06d4026f4f5f81a07eee2131b7df7808592132b --- /dev/null +++ b/paddle/phi/ops/compat/uniform_random_sig.cc @@ -0,0 +1,159 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature UniformRandomOpArgumentMapping( + const ArgumentMappingContext& ctx) { + int diag_num = paddle::any_cast(ctx.Attr("diag_num")); + if (ctx.IsDenseTensorOutput("Out")) { + if (diag_num) { + if (ctx.InputSize("ShapeTensorList") > 0) { + return KernelSignature("uniform_random_raw", + {}, + {"ShapeTensorList", + "dtype", + "min", + "max", + "seed", + "diag_num", + "diag_step", + "diag_val"}, + {"Out"}); + } else { + const auto& shape = + paddle::any_cast>(ctx.Attr("shape")); + if (ctx.HasInput("ShapeTensor") && shape.empty()) { + return KernelSignature("uniform_random_raw", + {}, + {"ShapeTensor", + "dtype", + "min", + "max", + "seed", + "diag_num", + "diag_step", + "diag_val"}, + {"Out"}); + } else { + return KernelSignature("uniform_random_raw", + {}, + {"shape", + "dtype", + "min", + "max", + "seed", + "diag_num", + "diag_step", + "diag_val"}, + {"Out"}); + } + } + } else { + if (ctx.InputSize("ShapeTensorList") > 0) { + return KernelSignature( + "uniform_random", + {}, + {"ShapeTensorList", "dtype", "min", "max", "seed"}, + {"Out"}); + } else { + const auto& shape = + paddle::any_cast>(ctx.Attr("shape")); + if (ctx.HasInput("ShapeTensor") && shape.empty()) { + return KernelSignature("uniform_random", + {}, + {"ShapeTensor", "dtype", "min", "max", "seed"}, + {"Out"}); + } else { + return KernelSignature("uniform_random", + {}, + {"shape", "dtype", "min", "max", "seed"}, + {"Out"}); + } + } + } + } else if (ctx.IsSelectedRowsOutput("Out")) { + if (diag_num) { + if (ctx.InputSize("ShapeTensorList") > 0) { + return KernelSignature("uniform_random_raw_sr", + {}, + {"ShapeTensorList", + "dtype", + "min", + "max", + "seed", + "diag_num", + "diag_step", + "diag_val"}, + {"Out"}); + } else { + const auto& shape = + paddle::any_cast>(ctx.Attr("shape")); + if (ctx.HasInput("ShapeTensor") && shape.empty()) { + return KernelSignature("uniform_random_raw_sr", + {}, + {"ShapeTensor", + "dtype", + "min", + "max", + "seed", + "diag_num", + "diag_step", + "diag_val"}, + {"Out"}); + } else { + return KernelSignature("uniform_random_raw_sr", + {}, + {"shape", + "dtype", + "min", + "max", + "seed", + "diag_num", + "diag_step", + "diag_val"}, + {"Out"}); + } + } + } else { + if (ctx.InputSize("ShapeTensorList") > 0) { + return KernelSignature( + "uniform_random_sr", + {}, + {"ShapeTensorList", "dtype", "min", "max", "seed"}, + {"Out"}); + } else { + const auto& shape = + paddle::any_cast>(ctx.Attr("shape")); + if (ctx.HasInput("ShapeTensor") && shape.empty()) { + return KernelSignature("uniform_random_sr", + {}, + {"ShapeTensor", "dtype", "min", "max", "seed"}, + {"Out"}); + } else { + return KernelSignature("uniform_random_sr", + {}, + {"shape", "dtype", "min", "max", "seed"}, + {"Out"}); + } + } + } + } + return KernelSignature("unregistered", {}, {}, {}); +} +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(uniform_random, phi::UniformRandomOpArgumentMapping); diff --git a/paddle/phi/ops/compat/where_grad_sig.cc b/paddle/phi/ops/compat/where_grad_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..71984a26d35afd841654d82480c263799bdbf181 --- /dev/null +++ b/paddle/phi/ops/compat/where_grad_sig.cc @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature WhereGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("where_grad", + {"Condition", "X", "Y", GradVarName("Out")}, + {}, + {GradVarName("X"), GradVarName("Y")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(where_grad, phi::WhereGradOpArgumentMapping); diff --git a/paddle/phi/tests/common/test_backend.cc b/paddle/phi/tests/common/test_backend.cc index fa4ffc84bf587defae06deb18dae283a64206b75..5d6862c368c57bc7dbfba2bc9eab960818c25e05 100644 --- a/paddle/phi/tests/common/test_backend.cc +++ b/paddle/phi/tests/common/test_backend.cc @@ -44,6 +44,9 @@ TEST(Backend, OStream) { oss << phi::Backend::GPUDNN; EXPECT_EQ(oss.str(), "GPUDNN"); oss.str(""); + oss << phi::Backend::KPS; + EXPECT_EQ(oss.str(), "KPS"); + oss.str(""); try { oss << phi::Backend::NUM_BACKENDS; } catch (const std::exception& exception) { @@ -61,6 +64,7 @@ TEST(Backend, StringToBackend) { EXPECT_EQ(phi::Backend::NPU, pexp::StringToBackend("NPU")); EXPECT_EQ(phi::Backend::MKLDNN, pexp::StringToBackend("MKLDNN")); EXPECT_EQ(phi::Backend::GPUDNN, pexp::StringToBackend("GPUDNN")); + EXPECT_EQ(phi::Backend::KPS, pexp::StringToBackend("KPS")); EXPECT_EQ(static_cast( static_cast(phi::Backend::NUM_BACKENDS) + 1), pexp::StringToBackend("CustomBackend")); diff --git a/paddle/phi/tests/core/test_custom_kernel.cc b/paddle/phi/tests/core/test_custom_kernel.cc index d8e42c9d0d8b11d393dbb71776671d9cb50a7715..69922c055cbac5fe3c3947d0d8d63ee4a1262a4c 100644 --- a/paddle/phi/tests/core/test_custom_kernel.cc +++ b/paddle/phi/tests/core/test_custom_kernel.cc @@ -146,12 +146,10 @@ TEST(CustomKernel, custom_kernel_dot) { custom_fake_dot_kernels.end()); // 3.before register - auto& kernel_factory_instance = phi::KernelFactory::Instance(); auto& kernels = phi::KernelFactory::Instance().kernels(); - EXPECT_TRUE(!kernel_factory_instance.HasCompatiblePhiKernel(op_name)); + EXPECT_TRUE(kernels.find(op_name) == kernels.end()); - // mock fake_dot is supported by phi for HasCompatiblePhiKernel check while - // registering + // mock fake_dot is supported by phi for check while registering auto& fake_dot_kernels = kernels[op_name]; EXPECT_TRUE(fake_dot_kernels.find( @@ -196,7 +194,7 @@ TEST(CustomKernel, custom_kernel_dot) { fake_dot_kernels.end()); // 4.kernel select - auto kernel = kernel_factory_instance.SelectKernelOrThrowError( + auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( op_name, phi::KernelKey(backend, layout, phi::DataType::UINT8)); // 5.prepare parameters for kernel diff --git a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py index d5bc2e6b5307bf477c928380070644aca3c67f62..9d9fbd39a5767ffe72ad579df2d31ac66eda2234 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py @@ -426,9 +426,6 @@ class Quant2Int8MkldnnPass(object): graph = self._apply_pass(graph, 
'depthwise_conv_mkldnn_pass') graph = self._apply_pass(graph, 'conv_bn_fuse_pass') graph = self._apply_pass(graph, 'conv_eltwiseadd_bn_fuse_pass') - graph = self._apply_pass(graph, 'conv_affine_channel_fuse_pass') - graph = self._apply_pass(graph, - 'conv_eltwiseadd_affine_channel_fuse_pass') graph = self._apply_pass(graph, 'conv_transpose_bn_fuse_pass') graph = self._apply_pass(graph, 'conv_transpose_eltwiseadd_bn_fuse_pass') diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index ddb86848f842a85acc12dca1044a594c484c06fe..0049f387b707fc853699474b34235f177d4672af 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -560,13 +560,19 @@ class DataParallel(layers.Layer): strategy=None, comm_buffer_size=25, last_comm_buffer_size=1, - find_unused_parameters=False): + find_unused_parameters=False, + process_group=None, + gradient_as_buffer_view=False, + static_graph=False): super(DataParallel, self).__init__(layers.full_name() + "_data_parallel") self._layers = layers self.find_unused_parameters = find_unused_parameters self.grad_need_sync = True + self.process_group = process_group + self.gradient_as_buffer_view = gradient_as_buffer_view + self.static_graph = static_graph # NOTE(chenweihang): The ParallelStrategy here is not strictly a strategy. # It just stores some environment variables, which can be constructed by diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 2361bd270623873384d3cea8cd11eb10a78ec116..7d64cf7bd894553de84293295dc737255e803613 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -590,7 +590,7 @@ foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) endforeach(TEST_OP) py_test_modules(test_adam_op_multi_thread MODULES test_adam_op ENVS FLAGS_inner_op_parallelism=4) -if (WITH_GPU OR WITH_XPU OR WITH_ASCEND OR WITH_ASCEND_CL) +if (WITH_GPU OR WITH_XPU OR WITH_ASCEND OR WITH_ASCEND_CL OR APPLE) py_test_modules(test_warpctc_op MODULES test_warpctc_op) set_tests_properties(test_warpctc_op PROPERTIES TIMEOUT 120) endif() diff --git a/python/paddle/fluid/tests/unittests/distributed_fused_lamb_test_base.py b/python/paddle/fluid/tests/unittests/distributed_fused_lamb_test_base.py index e0529c5d5f82cfccb1fd47705b2a4cda39c17827..00d2a1f71d6bd3f1d8ce3b8981be2b4732163340 100644 --- a/python/paddle/fluid/tests/unittests/distributed_fused_lamb_test_base.py +++ b/python/paddle/fluid/tests/unittests/distributed_fused_lamb_test_base.py @@ -144,6 +144,11 @@ def run_model(use_distributed_lamb, use_fp16, use_master_param_norm, **kwargs): grad_clip = kwargs.get('grad_clip', None) clip_after_allreduce = kwargs.get('clip_after_allreduce', True) + parameters = [p.name for p in main.all_parameters()] + exclude_fn = lambda var: var.name in parameters[::4] + kwargs['exclude_from_weight_decay_fn'] = exclude_fn + kwargs['lamb_weight_decay'] = 0.1 + if use_distributed_lamb: optimizer_class = DistributedFusedLamb kwargs = dict(kwargs) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_affine_channel_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_affine_channel_fuse_pass.py deleted file mode 100644 index 5afaf08eec3b1324df312920bd9e8c8970fd7dbc..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_affine_channel_fuse_pass.py +++ /dev/null @@ -1,160 +0,0 
@@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from auto_scan_test import PassAutoScanTest, IgnoreReasons -from program_config import TensorConfig, ProgramConfig, OpConfig -import numpy as np -import paddle.inference as paddle_infer -from functools import partial -from typing import Optional, List, Callable, Dict, Any, Set -import unittest - -import hypothesis -from hypothesis import given, settings, seed, example, assume, reproduce_failure -import hypothesis.strategies as st - - -class TestConvAffineChannelFusePass(PassAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: - return True - - def sample_program_config(self, draw): - padding_algorithm = draw(st.sampled_from(["EXPLICIT", "SAME", "VALID"])) - groups = draw(st.integers(min_value=1, max_value=3)) - data_format = draw(st.sampled_from(["NCHW", "NHWC"])) - axis = draw(st.sampled_from([1])) - filter_channel = draw(st.integers(min_value=1, max_value=16)) * 4 - filter_size = draw(st.integers(min_value=1, max_value=4)) - in_channel = groups * filter_channel - out_channel_factor = draw(st.integers(min_value=1, max_value=16)) * 4 - out_channel = groups * out_channel_factor - batch_size = draw(st.integers(min_value=1, max_value=4)) - dilations = draw( - st.lists( - st.integers( - min_value=1, max_value=2), min_size=2, max_size=2)) - paddings = draw( - st.lists( - st.integers( - min_value=0, max_value=2), min_size=2, max_size=2)) - strides = draw( - st.lists( - st.integers( - min_value=1, max_value=2), min_size=2, max_size=2)) - has_bias = draw(st.booleans()) - - x_shape = [ - batch_size, in_channel, 64, 64 - ] if data_format == "NCHW" else [batch_size, 64, 64, in_channel] - w_shape = [out_channel, filter_channel, filter_size, filter_size] - scale_shape = [out_channel] - bias_shape = [out_channel] - - def generate_input(): - return np.random.random(x_shape).astype(np.float32) - - def generate_weight(): - return np.random.random(w_shape).astype(np.float32) - - def generate_bias(): - return np.random.random(bias_shape).astype(np.float32) - - def generate_scale_bias(): - return np.random.random(bias_shape).astype(np.float32) - - conv2d_op = OpConfig( - "conv2d", - inputs={ - "Input": ["input_data"], - "Filter": ["conv2d_weight"], - }, - outputs={"Output": ["conv_output"]}, - data_format=data_format, - dilations=dilations, - padding_algorithm=padding_algorithm, - groups=groups, - paddings=paddings, - strides=strides, - has_bias=has_bias, - is_test=True) - ac_op = OpConfig( - "affine_channel", - inputs={ - "X": ["conv_output"], - "Scale": ["affine_channel_scale"], - "Bias": ["affine_channel_bias"] - }, - outputs={"Out": ["affine_channel_ouput"]}, - data_layout=data_format) - if has_bias == True: - conv2d_op.inputs["Bias"] = ["conv2d_bias"] - ops = [conv2d_op, ac_op] - - program_config = ProgramConfig( - ops=ops, - inputs={ - "input_data": TensorConfig(data_gen=partial(generate_input)), - }, - weights={ - "conv2d_weight": - 
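For readers unfamiliar with the op being fused in the deleted test above: affine_channel applies a per-channel scale and bias. A rough numpy equivalent (illustrative only, NCHW layout as in the test's default):

    import numpy as np

    def affine_channel_nchw(x, scale, bias):
        # x: [N, C, H, W]; scale, bias: [C] broadcast over the channel axis
        return x * scale.reshape(1, -1, 1, 1) + bias.reshape(1, -1, 1, 1)

    x = np.random.rand(2, 16, 8, 8).astype(np.float32)
    scale = np.random.rand(16).astype(np.float32)
    bias = np.random.rand(16).astype(np.float32)
    y = affine_channel_nchw(x, scale, bias)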
TensorConfig(data_gen=partial(generate_weight)), - "affine_channel_scale": - TensorConfig(data_gen=partial(generate_scale_bias)), - "affine_channel_bias": - TensorConfig(data_gen=partial(generate_scale_bias)), - }, - outputs=["affine_channel_ouput"]) - if has_bias == True: - program_config.weights["conv2d_bias"] = TensorConfig( - data_gen=partial(generate_bias)) - return program_config - - def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_gpu=True) - yield config, ['conv2d', 'elementwise_add'], (1e-4, 1e-4) - - config = self.create_inference_config(use_mkldnn=True) - yield config, ['conv2d', 'elementwise_add'], (1e-4, 1e-4) - - def add_ignore_pass_case(self): - # If the problem has been fixed, the judgment - # in is_program_valid needs to be deleted!!! - def teller1(program_config, predictor_config): - if program_config.ops[0].attrs['data_format'] == "NHWC": - return True - return False - - # mkldnn Output has diff with bias! - def teller2(program_config, predictor_config): - return predictor_config.mkldnn_enabled() and program_config.ops[ - 0].attrs['has_bias'] == True - - self.add_ignore_check_case( - teller1, IgnoreReasons.PASS_ACCURACY_ERROR, - "The output format of conv2d is wrong when data_format attribute is NHWC, \ - because currently its fused op (Conv2DFusion) only supports data format of channel first (NCHW)." - ) - - self.add_ignore_check_case( - teller2, IgnoreReasons.PASS_ACCURACY_ERROR, - "Currently mkldnn Output has diff with bias!") - - def test(self): - self.run_and_statis( - quant=False, - passes=["conv_affine_channel_fuse_pass"], ) - - -if __name__ == "__main__": - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_eltwiseadd_affine_channel_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_eltwiseadd_affine_channel_fuse_pass.py deleted file mode 100644 index a8bfdb79ca1daa5caa0cffb945fee76fdef36c36..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_eltwiseadd_affine_channel_fuse_pass.py +++ /dev/null @@ -1,183 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
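The pass exercised by the deleted test rewrites conv2d + affine_channel into conv2d + elementwise_add, which is why the expected op list after fusion is ['conv2d', 'elementwise_add']. A sketch of the underlying algebra (illustrative only, ignoring groups and the optional conv bias):

    import numpy as np

    # scale * conv(W, x) + bias == conv(scale * W, x) + bias, with scale applied
    # per output channel, so the scale folds into the filter and only a per-channel
    # bias add (an elementwise_add) remains after the conv.
    W = np.random.rand(32, 16, 3, 3).astype(np.float32)   # [out_c, in_c, kh, kw]
    scale = np.random.rand(32).astype(np.float32)
    W_folded = W * scale.reshape(-1, 1, 1, 1)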
- -from auto_scan_test import PassAutoScanTest, IgnoreReasons -from program_config import TensorConfig, ProgramConfig, OpConfig -import numpy as np -import paddle.inference as paddle_infer -from functools import partial -from typing import Optional, List, Callable, Dict, Any, Set -import unittest - -import hypothesis -from hypothesis import given, settings, seed, example, assume -import hypothesis.strategies as st - - -class TestConvEltwiseAddAffineChannelFusePass(PassAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: - attrs = [ - program_config.ops[i].attrs - for i in range(len(program_config.ops)) - ] - - if attrs[0]['data_format'] == "NHWC" and attrs[1]['axis'] != 3: - return False - - return True - - def sample_program_config(self, draw): - padding_algorithm = draw(st.sampled_from(["EXPLICIT", "SAME", "VALID"])) - groups = draw(st.integers(min_value=1, max_value=3)) - data_format = draw(st.sampled_from(["NCHW", "NHWC"])) - axis = draw(st.sampled_from([1])) - filter_channel = draw(st.integers(min_value=1, max_value=16)) * 4 - filter_size = draw(st.integers(min_value=1, max_value=4)) - in_channel = groups * filter_channel - out_channel_factor = draw(st.integers(min_value=1, max_value=16)) * 4 - out_channel = groups * out_channel_factor - batch_size = draw(st.integers(min_value=1, max_value=4)) - dilations = draw( - st.lists( - st.integers( - min_value=1, max_value=2), min_size=2, max_size=2)) - paddings = draw( - st.lists( - st.integers( - min_value=0, max_value=2), min_size=2, max_size=2)) - strides = draw( - st.lists( - st.integers( - min_value=1, max_value=2), min_size=2, max_size=2)) - has_bias = draw(st.booleans()) - - x_shape = [ - batch_size, in_channel, 64, 64 - ] if data_format == "NCHW" else [batch_size, 64, 64, in_channel] - w_shape = [out_channel, filter_channel, filter_size, filter_size] - scale_shape = [out_channel] - bias_shape = [out_channel] - - def generate_input(): - return np.random.random(x_shape).astype(np.float32) - - def generate_weight(): - return np.random.random(w_shape).astype(np.float32) - - def generate_bias(): - return np.random.random(bias_shape).astype(np.float32) - - def generate_scale_bias(): - return np.random.random(bias_shape).astype(np.float32) - - conv2d_op = OpConfig( - "conv2d", - inputs={ - "Input": ["input_data"], - "Filter": ["conv2d_weight"], - }, - outputs={"Output": ["conv_output"]}, - data_format=data_format, - dilations=dilations, - padding_algorithm=padding_algorithm, - groups=groups, - paddings=paddings, - strides=strides, - has_bias=has_bias, - is_test=True) - eltwise_op = OpConfig( - "elementwise_add", - inputs={"X": ["conv_output"], - "Y": ["conv2d_bias"]}, - outputs={"Out": ["elementwise_output"]}, - axis=axis) - ac_op = OpConfig( - "affine_channel", - inputs={ - "X": ["elementwise_output"], - "Scale": ["affine_channel_scale"], - "Bias": ["affine_channel_bias"] - }, - outputs={"Out": ["affine_channel_ouput"]}, - data_layout=data_format) - if has_bias == True: - conv2d_op.inputs["Bias"] = ["conv2d_bias"] - ops = [conv2d_op, eltwise_op, ac_op] - program_config = ProgramConfig( - ops=ops, - inputs={ - "input_data": TensorConfig(data_gen=partial(generate_input)), - }, - weights={ - "conv2d_weight": - TensorConfig(data_gen=partial(generate_weight)), - "conv2d_bias": TensorConfig(data_gen=partial(generate_bias)), - "affine_channel_scale": - TensorConfig(data_gen=partial(generate_scale_bias)), - "affine_channel_bias": - TensorConfig(data_gen=partial(generate_scale_bias)), - }, - 
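The is_program_valid rule above (NHWC programs with elementwise_add axis != 3 are rejected) comes down to where the per-channel bias broadcasts; a small illustration:

    import numpy as np

    nhwc = np.zeros([2, 8, 8, 16], dtype=np.float32)
    bias = np.zeros([16], dtype=np.float32)
    # In NHWC the channel is the last axis, so a [C] bias must be added along
    # axis 3; adding it along axis 1 (the NCHW channel position) cannot broadcast.
    out = nhwc + bias.reshape(1, 1, 1, -1)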
outputs=["affine_channel_ouput"]) - return program_config - - def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_gpu=True) - yield config, ['conv2d', 'elementwise_add'], (1e-4, 1e-4) - - config = self.create_inference_config(use_mkldnn=True) - yield config, ['conv2d', 'elementwise_add'], (1e-4, 1e-4) - - # TRT - config = self.create_trt_inference_config() - config.enable_tensorrt_engine( - workspace_size=1 << 20, - max_batch_size=4, - min_subgraph_size=1, - precision_mode=paddle_infer.PrecisionType.Float32, - use_static=False, - use_calib_mode=False) - yield config, ['conv2d', 'elementwise_add'], (1e-4, 1e-4) - - def add_ignore_pass_case(self): - # If the problem has been fixed, the judgment - # in is_program_valid needs to be deleted!!! - def teller1(program_config, predictor_config): - if program_config.ops[0].attrs['data_format'] == "NHWC": - return True - return False - - # mkldnn Output has diff with bias! - def teller2(program_config, predictor_config): - return predictor_config.mkldnn_enabled() and program_config.ops[ - 0].attrs['has_bias'] == True - - self.add_ignore_check_case( - teller1, IgnoreReasons.PASS_ACCURACY_ERROR, - "The output format of conv2d is wrong when data_format attribute is NHWC, \ - it will trigger Broadcast dimension mismatch bug \ - when data_format attribute is NHWC and axis of eltwise op is 1 for this pass." - ) - - self.add_ignore_check_case( - teller2, IgnoreReasons.PASS_ACCURACY_ERROR, - "Currently mkldnn Output has diff with bias!") - - def test(self): - self.run_and_statis( - quant=False, - passes=["conv_eltwiseadd_affine_channel_fuse_pass"], ) - - -if __name__ == "__main__": - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 848ebae0706e3c62e0e0e6579cd3c04f02d43be4..628791afef5f66cd8eeddae7685d7a7ffdb6dd08 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -482,7 +482,12 @@ class OpTest(unittest.TestCase): op_proto = OpProtoHolder.instance().get_op_proto(self.op_type) "infer datatype from inputs and outputs for this test case" - self.infer_dtype_from_inputs_outputs(self.inputs, self.outputs) + if self.is_bfloat16_op(): + self.dtype = np.uint16 + self.__class__.dtype = self.dtype + self.output_dtype = np.uint16 + else: + self.infer_dtype_from_inputs_outputs(self.inputs, self.outputs) inputs = append_input_output(block, op_proto, self.inputs, True, self.dtype) outputs = append_input_output(block, op_proto, self.outputs, False, @@ -1135,7 +1140,7 @@ class OpTest(unittest.TestCase): else: atol = 2 else: - atol = 1e-2 + atol = 1e-1 if no_check_set is not None: if self.op_type not in no_check_set_white_list.no_check_set_white_list: diff --git a/python/paddle/fluid/tests/unittests/test_diff_op.py b/python/paddle/fluid/tests/unittests/test_diff_op.py index 345dad54132bc8c2d8520bc86c3276f651893e99..1ae780f488d2dc6bf37f88505a67723ea867dd94 100644 --- a/python/paddle/fluid/tests/unittests/test_diff_op.py +++ b/python/paddle/fluid/tests/unittests/test_diff_op.py @@ -55,7 +55,7 @@ class TestDiffOp(unittest.TestCase): def test_dygraph(self): for place in self.places: - paddle.disable_static(place) + paddle.disable_static() x = paddle.to_tensor(self.input, place=place) if self.prepend is not None: self.prepend = paddle.to_tensor(self.prepend, place=place) diff --git a/python/paddle/fluid/tests/unittests/test_gather_op.py b/python/paddle/fluid/tests/unittests/test_gather_op.py 
index 83b39a62f152d2c7e02abe313ffeeafe017d033d..978a3d86d882a2e0d59e8244a956f5c97a4bd9ef 100644 --- a/python/paddle/fluid/tests/unittests/test_gather_op.py +++ b/python/paddle/fluid/tests/unittests/test_gather_op.py @@ -16,7 +16,7 @@ from __future__ import print_function import unittest import numpy as np -from op_test import OpTest +from op_test import OpTest, convert_float_to_uint16 import paddle import paddle.fluid as fluid from paddle.framework import core @@ -117,6 +117,39 @@ class TestCase6(TestGatherOp): self.index_type = "int32" +class TestGatherBF16Op(OpTest): + def setUp(self): + self.op_type = "gather" + self.dtype = np.uint16 + self.config() + xnp = np.random.random(self.x_shape).astype(np.float32) + axis_np = np.array(self.axis).astype(self.axis_type) + index_np = np.array(self.index).astype(self.index_type) + self.inputs = { + 'X': convert_float_to_uint16(xnp), + 'Index': index_np, + 'Axis': axis_np + } + out = gather_numpy(self.inputs['X'], index_np, axis_np[0]) + self.outputs = {'Out': out} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out', numeric_grad_delta=0.5) + + def config(self): + """ + For multi-dimension input + """ + self.x_shape = (3, 88, 3) + self.index = [1, 3, 5] + self.index_type = "int32" + self.axis = [1] + self.axis_type = "int32" + + class TestGatherOp1(OpTest): def setUp(self): self.op_type = "gather" diff --git a/python/paddle/fluid/tests/unittests/test_imperative_group.py b/python/paddle/fluid/tests/unittests/test_imperative_group.py index f96358096516e67af6269c321a2722c500489959..89535797ed09890df44939efbc531df53d710304 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_group.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_group.py @@ -26,159 +26,149 @@ import paddle.fluid.dygraph as dygraph from paddle.fluid.dygraph.nn import Linear import paddle.fluid.core as core from paddle.fluid.optimizer import SGDOptimizer - - -class MLP(fluid.Layer): - def __init__(self, param_attr=None, bias_attr=None): - super(MLP, self).__init__() - - self._linear1 = Linear(784, 10) - self._linear2 = Linear(10, 10) - - def forward(self, inputs): - y = self._linear1(inputs) - y = self._linear2(y) - return y +from paddle.fluid.framework import _test_eager_guard class TestDataParallelGroup(unittest.TestCase): - def create_varbase(self, dtype, shape, - type=core.VarDesc.VarType.LOD_TENSOR): - return core.VarBase(dtype, shape, "", type, True) + def create_varbase(self, dtype, shape): + return paddle.rand(shape=shape, dtype=dtype) + + def assign_group_by_size(self, *args): + return core.assign_group_by_size(*args) def test_construct_group0(self): # one dtype & one limit capability var_list = [] - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 50])) - var_list.append( - self.create_varbase(core.VarDesc.VarType.FP32, [2, 100])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 25])) - res = core.assign_group_by_size(var_list, [False, False, False, False], + var_list.append(self.create_varbase("float32", [2, 50])) + var_list.append(self.create_varbase("float32", [2, 100])) + var_list.append(self.create_varbase("float32", [2, 50])) + var_list.append(self.create_varbase("float32", [2, 25])) + res = self.assign_group_by_size(var_list, [False, False, False, False], [400]) self.assertEqual([[0], [1], [2], [3]], res) def test_construct_group1(self): # multi dtype & one limit 
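For reference, the gather computation exercised by TestGatherBF16Op above, shown here in float32 with smaller shapes for readability (axis taken from the Axis input):

    import numpy as np

    x = np.arange(24, dtype=np.float32).reshape(2, 4, 3)
    index = np.array([1, 3], dtype=np.int32)
    out = np.take(x, index, axis=1)   # select rows 1 and 3 along axis 1 -> shape (2, 2, 3)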
capability var_list = [] - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) - res = core.assign_group_by_size( + var_list.append(self.create_varbase("float32", [1, 50])) + var_list.append(self.create_varbase("float64", [1, 25])) + var_list.append(self.create_varbase("float32", [1, 50])) + var_list.append(self.create_varbase("float64", [1, 25])) + var_list.append(self.create_varbase("float32", [1, 50])) + var_list.append(self.create_varbase("float64", [1, 25])) + res = self.assign_group_by_size( var_list, [False, False, False, False, False, False], [400]) self.assertEqual([[0, 2], [1, 3], [4], [5]], res) def test_construct_group2(self): # one dtype & multi limit capability var_list = [] - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 50])) - res = core.assign_group_by_size(var_list, [False, False, False, False], + var_list.append(self.create_varbase("float32", [2, 50])) + var_list.append(self.create_varbase("float32", [2, 50])) + var_list.append(self.create_varbase("float32", [2, 50])) + var_list.append(self.create_varbase("float32", [2, 50])) + res = self.assign_group_by_size(var_list, [False, False, False, False], [400, 800]) self.assertEqual([[0], [1, 2], [3]], res) def test_construct_group3(self): # multi dtype & multi limit capability var_list = [] - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) - res = core.assign_group_by_size( + var_list.append(self.create_varbase("float32", [1, 50])) + var_list.append(self.create_varbase("float64", [1, 25])) + var_list.append(self.create_varbase("float32", [1, 50])) + var_list.append(self.create_varbase("float64", [1, 25])) + var_list.append(self.create_varbase("float32", [1, 50])) + var_list.append(self.create_varbase("float64", [1, 25])) + res = self.assign_group_by_size( var_list, [False, False, False, False, False, False], [200, 400]) self.assertEqual([[0], [1], [2, 4], [3, 5]], res) def test_construct_group4(self): # multi dtype & zero limit capability var_list = [] - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) - res = core.assign_group_by_size( + var_list.append(self.create_varbase("float32", [1, 50])) + 
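A worked reading of the grouping cases above, assuming (as the expected outputs suggest) that the capability limits are byte sizes: a float32 [2, 50] variable occupies 2*50*4 = 400 bytes, so with a single 400-byte limit each such variable closes its own group (test_construct_group0), while with limits [400, 800] the second bucket can hold two of them (test_construct_group2). A simplified model that ignores dtype, sparse flags and tensor_indices:

    # Illustrative bookkeeping only; mirrors the expected [[0], [1, 2], [3]]
    # result of test_construct_group2 with limits [400, 800].
    sizes = [2 * 50 * 4] * 4          # four float32 [2, 50] tensors -> 400 bytes each
    limits = [400, 800]
    groups, cur, limit_idx, acc = [], [], 0, 0
    for i, s in enumerate(sizes):
        cur.append(i)
        acc += s
        if acc >= limits[min(limit_idx, len(limits) - 1)]:
            groups.append(cur)
            cur, acc, limit_idx = [], 0, limit_idx + 1
    if cur:
        groups.append(cur)
    print(groups)   # [[0], [1, 2], [3]]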
var_list.append(self.create_varbase("float64", [1, 25])) + var_list.append(self.create_varbase("float32", [1, 50])) + var_list.append(self.create_varbase("float64", [1, 25])) + var_list.append(self.create_varbase("float32", [1, 50])) + var_list.append(self.create_varbase("float64", [1, 25])) + res = self.assign_group_by_size( var_list, [False, False, False, False, False, False], [0]) self.assertEqual([[0], [1], [2], [3], [4], [5]], res) def test_construct_group5(self): # multi dtype & infinite capability var_list = [] - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) - res = core.assign_group_by_size( + var_list.append(self.create_varbase("float32", [1, 50])) + var_list.append(self.create_varbase("float64", [1, 25])) + var_list.append(self.create_varbase("float32", [1, 50])) + var_list.append(self.create_varbase("float64", [1, 25])) + var_list.append(self.create_varbase("float32", [1, 50])) + var_list.append(self.create_varbase("float64", [1, 25])) + res = self.assign_group_by_size( var_list, [False, False, False, False, False, False], [10000]) self.assertEqual([[0, 2, 4], [1, 3, 5]], res) def test_construct_group6(self): # multi dtype & limit capability & multi tensor type var_list = [] - var_list.append( - self.create_varbase(core.VarDesc.VarType.FP32, [1, 50], - core.VarDesc.VarType.SELECTED_ROWS)) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) - var_list.append( - self.create_varbase(core.VarDesc.VarType.FP64, [1, 25], - core.VarDesc.VarType.SELECTED_ROWS)) - res = core.assign_group_by_size( + var_list.append(self.create_varbase( + "float32", + [1, 50], )) + var_list.append(self.create_varbase("float64", [1, 25])) + var_list.append(self.create_varbase("float32", [1, 50])) + var_list.append(self.create_varbase("float64", [1, 25])) + var_list.append(self.create_varbase("float32", [1, 50])) + var_list.append(self.create_varbase("float64", [1, 25])) + res = self.assign_group_by_size( var_list, [True, False, False, False, False, True], [400]) self.assertEqual([[0], [1, 3], [2, 4], [5]], res) def test_construct_group7(self): # multi dtype & multi limit capability & multi tensor type var_list = [] - var_list.append( - self.create_varbase(core.VarDesc.VarType.FP32, [1, 50], - core.VarDesc.VarType.SELECTED_ROWS)) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) - var_list.append( - self.create_varbase(core.VarDesc.VarType.FP64, [1, 25], - core.VarDesc.VarType.SELECTED_ROWS)) - res = core.assign_group_by_size( + var_list.append(self.create_varbase("float32", [1, 50])) + var_list.append(self.create_varbase("float64", [1, 25])) + var_list.append(self.create_varbase("float32", [1, 50])) + 
var_list.append(self.create_varbase("float64", [1, 25])) + var_list.append(self.create_varbase("float32", [1, 50])) + var_list.append(self.create_varbase("float64", [1, 25])) + res = self.assign_group_by_size( var_list, [True, False, False, False, False, True], [200, 400]) self.assertEqual([[0], [1], [2], [3], [4], [5]], res) def test_construct_group8(self): # one dtype & one limit capability & have tensor_indices var_list = [] - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 25])) - var_list.append( - self.create_varbase(core.VarDesc.VarType.FP32, [2, 100])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 25])) - res = core.assign_group_by_size(var_list, [False, False, False, False], + var_list.append(self.create_varbase("float32", [2, 25])) + var_list.append(self.create_varbase("float32", [2, 100])) + var_list.append(self.create_varbase("float32", [2, 50])) + var_list.append(self.create_varbase("float32", [2, 25])) + res = self.assign_group_by_size(var_list, [False, False, False, False], [400], [3, 0, 1, 2]) self.assertEqual([[3, 0], [1], [2]], res) def test_construct_group9(self): # one dtype & one limit capability & have tensor_indices var_list = [] - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 25])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 25])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 25])) - var_list.append( - self.create_varbase(core.VarDesc.VarType.FP32, [2, 1000])) - res = core.assign_group_by_size(var_list, [False, False, False, True], + var_list.append(self.create_varbase("float32", [2, 25])) + var_list.append(self.create_varbase("float32", [2, 25])) + var_list.append(self.create_varbase("float32", [2, 25])) + var_list.append(self.create_varbase("float32", [2, 1000])) + res = self.assign_group_by_size(var_list, [False, False, False, True], [300], [1, 0, 2, 3]) self.assertEqual([[1, 0], [3], [2]], res) +class TestDataParallelGroupEager(TestDataParallelGroup): + def create_varbase(self, dtype, shape): + with _test_eager_guard(): + return paddle.rand(shape=shape, dtype=dtype) + + def assign_group_by_size(self, *args): + return core.eager_assign_group_by_size(*args) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_layer_norm_op.py b/python/paddle/fluid/tests/unittests/test_layer_norm_op.py index 7dd310d2b88a90e09ba5ceedb541da4be263e559..ca9a489c7496f33cb084f1cd43158cebc7a1add6 100644 --- a/python/paddle/fluid/tests/unittests/test_layer_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_layer_norm_op.py @@ -375,6 +375,53 @@ class TestFP16ScaleBiasLayerNorm(unittest.TestCase): assert_equal(b_g_np_1, b_g_np_2) +class TestBF16ScaleBiasLayerNorm(unittest.TestCase): + def check_main(self, x_np, weight_np, bias_np, dtype): + paddle.disable_static() + + x = paddle.to_tensor(x_np) + weight = paddle.to_tensor(weight_np) + bias = paddle.to_tensor(bias_np) + + if dtype == "bfloat16": + x = x.cast(paddle.fluid.core.VarDesc.VarType.BF16) + + x.stop_gradient = False + weight.stop_gradient = False + bias.stop_gradient = False + + y = F.layer_norm(x, x.shape[1:], weight, bias) + x_g, w_g, b_g = paddle.grad(y, [x, weight, bias]) + + y_np = y.cast('float32').numpy() + x_g_np = x_g.cast('float32').numpy() + w_g_np = w_g.cast('float32').numpy() + b_g_np = b_g.cast('float32').numpy() + + paddle.enable_static() + return y_np, x_g_np, w_g_np, b_g_np + + 
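Two more behaviours visible in the cases above: variables flagged sparse (the True entries in the is_sparse list) each get their own group, and when tensor_indices is given the variables are bucketed in that order. A worked check of test_construct_group8 under the same byte-size assumption:

    # sizes in bytes, visited in tensor_indices order [3, 0, 1, 2], limit 400:
    sizes = {0: 2 * 25 * 4, 1: 2 * 100 * 4, 2: 2 * 50 * 4, 3: 2 * 25 * 4}
    # 200 + 200 = 400 closes the first bucket -> [3, 0]; 800 alone -> [1]; 400 -> [2]
    # which matches the expected [[3, 0], [1], [2]].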
def test_main(self): + if (not core.is_compiled_with_cuda()) or (core.cudnn_version() < 8100): + return + x_np = np.random.random([10, 20]).astype('float32') + weight_np = np.random.random([20]).astype('float32') + bias_np = np.random.random([20]).astype('float32') + + y_np_1, x_g_np_1, w_g_np_1, b_g_np_1 = self.check_main( + x_np, weight_np, bias_np, 'float32') + y_np_2, x_g_np_2, w_g_np_2, b_g_np_2 = self.check_main( + x_np, weight_np, bias_np, 'bfloat16') + + def assert_equal(x, y): + self.assertTrue(np.allclose(x, y, atol=1.e-1)) + + assert_equal(y_np_1, y_np_2) + assert_equal(x_g_np_1, x_g_np_2) + assert_equal(w_g_np_1, w_g_np_2) + assert_equal(b_g_np_1, b_g_np_2) + + class TestGetSetKeepLayerNormScaleBiasFP32Flag(unittest.TestCase): def test_main(self): self.assertTrue(_keep_layer_norm_scale_bias_to_fp32()) diff --git a/python/paddle/fluid/tests/unittests/test_norm_all.py b/python/paddle/fluid/tests/unittests/test_norm_all.py index b20305b78efe2dfe73e069e13f0d0eca3bb84057..575bc653618a583e883783cd1fffe1db371eccff 100644 --- a/python/paddle/fluid/tests/unittests/test_norm_all.py +++ b/python/paddle/fluid/tests/unittests/test_norm_all.py @@ -16,7 +16,7 @@ from __future__ import print_function import unittest import numpy as np -from op_test import OpTest +from op_test import OpTest, convert_float_to_uint16 import paddle import paddle.fluid as fluid import paddle.fluid.core as core @@ -282,6 +282,80 @@ class TestPnormOpFP161(TestPnormOpFP16): self.asvector = True +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestPnormBF16Op(OpTest): + def setUp(self): + self.op_type = "p_norm" + self.init_test_case() + self.x = (np.random.random(self.shape) + 0.5).astype(np.float32) + self.norm = p_norm(self.x, self.axis, self.porder, self.keepdim, + self.asvector) + self.gradient = self.calc_gradient() + self.inputs = {'X': convert_float_to_uint16(self.x)} + self.attrs = { + 'epsilon': self.epsilon, + 'axis': self.axis, + 'keepdim': self.keepdim, + 'porder': float(self.porder), + 'asvector': self.asvector + } + self.outputs = {'Out': convert_float_to_uint16(self.norm)} + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place, atol=1e-3) + + def test_check_grad(self): + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, ['X'], 'Out', user_defined_grads=self.gradient) + + def init_test_case(self): + self.shape = [2, 3, 4, 5] + self.axis = 1 + self.epsilon = 1e-12 + self.porder = 2.0 + self.keepdim = False + self.dtype = np.uint16 + self.asvector = False + + def calc_gradient(self): + self.attrs = { + 'epsilon': self.epsilon, + 'axis': self.axis, + 'keepdim': self.keepdim, + 'porder': float(self.porder), + 'asvector': self.asvector + } + x = self.x + porder = self.attrs["porder"] + axis = self.attrs["axis"] + asvector = self.attrs["asvector"] + x_dtype = x.dtype + x = x.astype(np.float32) if x.dtype == np.float16 else x + if porder == 0: + grad = np.zeros(x.shape).astype(x.dtype) + elif porder in [float("inf"), float("-inf")]: + norm = p_norm( + x, axis=axis, porder=porder, keepdims=True, reduce_all=asvector) + x_abs = np.abs(x) + grad = np.sign(x) + grad[x_abs != norm] = 0.0 + else: + norm = p_norm( + x, axis=axis, porder=porder, keepdims=True, reduce_all=asvector) + grad = np.power(norm, 1 - porder) * np.power( + np.abs(x), porder - 1) * np.sign(x) + + numel = 1 + for s in x.shape: + numel *= s + divisor = numel if asvector else x.shape[axis] + numel /= divisor + return [grad.astype(x_dtype) * 1 / 
numel] + + def run_fro(self, p, axis, shape_x, dtype, keep_dim, check_dim=False): with fluid.program_guard(fluid.Program()): data = fluid.data(name="X", shape=shape_x, dtype=dtype) diff --git a/python/paddle/fluid/tests/unittests/test_reduce_op.py b/python/paddle/fluid/tests/unittests/test_reduce_op.py index faa67e1d6da8f44bf1a09036d0d1dc9e49ff462c..d246356b4ec75a96162d0b37d4d1cbfab9493440 100644 --- a/python/paddle/fluid/tests/unittests/test_reduce_op.py +++ b/python/paddle/fluid/tests/unittests/test_reduce_op.py @@ -16,7 +16,7 @@ from __future__ import print_function import unittest import numpy as np -from op_test import OpTest, skip_check_grad_ci +from op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16 import paddle import paddle.fluid.core as core import paddle.fluid as fluid @@ -61,6 +61,37 @@ class TestSumOp_fp16(OpTest): self.check_grad(['X'], 'Out', user_defined_grads=self.gradient) +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestSumOp_bf16(OpTest): + def setUp(self): + np.random.seed(100) + self.op_type = "reduce_sum" + self.dtype = np.uint16 + self.x = np.random.uniform(0, 0.1, (2, 5, 10)).astype(np.float32) + self.attrs = {'dim': [0, 1, 2]} + self.out = self.x.sum(axis=tuple(self.attrs['dim'])) + self.gradient = self.calc_gradient() + + self.inputs = {'X': convert_float_to_uint16(self.x)} + self.outputs = {'Out': convert_float_to_uint16(self.out)} + self.gradient = self.calc_gradient() + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place) + + def test_check_grad(self): + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, ['X'], 'Out', user_defined_grads=self.gradient) + + def calc_gradient(self): + x = self.x + grad = np.ones(x.shape, dtype=x.dtype) + return [grad] + + class TestSumOp_fp16_withInt(OpTest): def setUp(self): self.op_type = "reduce_sum" diff --git a/python/paddle/fluid/tests/unittests/test_scale_op.py b/python/paddle/fluid/tests/unittests/test_scale_op.py index c1ce032f506127e495dfd3231471fdabe6dfa26b..d432b8057f624831f40b8cd48a0ede694f8d0a55 100644 --- a/python/paddle/fluid/tests/unittests/test_scale_op.py +++ b/python/paddle/fluid/tests/unittests/test_scale_op.py @@ -16,7 +16,7 @@ from __future__ import print_function import unittest import numpy as np -from op_test import OpTest +from op_test import OpTest, convert_float_to_uint16 import paddle import paddle.fluid as fluid import paddle.fluid.core as core @@ -153,6 +153,23 @@ class TestScaleFp16Op(TestScaleOp): place, ["X"], "Out", max_relative_error=0.05) +class TestScaleBF16Op(OpTest): + def setUp(self): + self.op_type = "scale" + self.dtype = np.uint16 + self.attrs = {'scale': -2.3} + x = np.random.random((10, 10)).astype(np.float32) + out = x * np.float32(self.attrs['scale']) + self.inputs = {'X': convert_float_to_uint16(x)} + self.outputs = {'Out': convert_float_to_uint16(out)} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out', numeric_grad_delta=0.8) + + @unittest.skipIf(not core.is_compiled_with_cuda(), "core is not compiled with CUDA") class TestScaleFp16OpSelectedRows(TestScaleOpSelectedRows): diff --git a/python/paddle/fluid/tests/unittests/test_sum_op.py b/python/paddle/fluid/tests/unittests/test_sum_op.py index eddccd4ff24f1a8b7c23bda3da813bc87c199cbe..7040145a76833588f0a5738b1b09e10061497e8c 100644 --- a/python/paddle/fluid/tests/unittests/test_sum_op.py +++ 
b/python/paddle/fluid/tests/unittests/test_sum_op.py @@ -298,6 +298,32 @@ def create_test_sum_fp16_class(parent): globals()[cls_name] = TestSumFp16Case +#----------- test bf16 ----------- +class TestSumBF16Op(OpTest): + def setUp(self): + self.op_type = "sum" + self.init_kernel_type() + x0 = np.random.random((3, 40)).astype(np.float32) + x1 = np.random.random((3, 40)).astype(np.float32) + x2 = np.random.random((3, 40)).astype(np.float32) + y = x0 + x1 + x2 + self.inputs = { + "X": [("x0", convert_float_to_uint16(x0)), + ("x1", convert_float_to_uint16(x1)), + ("x2", convert_float_to_uint16(x2))] + } + self.outputs = {'Out': convert_float_to_uint16(y)} + + def init_kernel_type(self): + self.dtype = np.uint16 + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['x0'], 'Out', numeric_grad_delta=0.5) + + class API_Test_Add_n(unittest.TestCase): def test_api(self): with fluid.program_guard(fluid.Program(), fluid.Program()): diff --git a/python/paddle/incubate/optimizer/distributed_fused_lamb.py b/python/paddle/incubate/optimizer/distributed_fused_lamb.py index e7c3cfbb7b93b5deffb95e9ee175a7a03d1aaf7f..cc33a909632766e81bfabdb73cc3a1e177c1fe1a 100644 --- a/python/paddle/incubate/optimizer/distributed_fused_lamb.py +++ b/python/paddle/incubate/optimizer/distributed_fused_lamb.py @@ -171,10 +171,7 @@ class DistributedFusedLamb(Optimizer): moment2.is_distributed = True beta1pow = self._create_persistable_var('beta1pow') beta2pow = self._create_persistable_var('beta2pow') - fused_indices = self._create_persistable_var( - 'fused_indices', dtype='int32') - weight_decay = self._create_persistable_var('weight_decay') - weight_decay.is_distributed = True + param_info = self._create_persistable_var('param_info', dtype='int32') param_info.is_distributed = True @@ -189,17 +186,20 @@ class DistributedFusedLamb(Optimizer): 'fp16_partial_fused_offsets', dtype='int32') fp16_partial_fused_offsets.is_distributed = True + param_order = self._create_persistable_var('param_order', dtype='int32') + param_order.is_distributed = True + rank = get_rank() nranks = get_world_size() scale = self._get_or_create_scale() params = [p for p, _ in params_grads] grads = [g for _, g in params_grads] - weight_decay_values = [self._weight_decay] * len(params) + apply_weight_decay = [1] * len(params) if self._exclude_from_weight_decay_fn is not None: for i, p in enumerate(params): if self._exclude_from_weight_decay_fn(p): - weight_decay_values[i] = 0.0 + apply_weight_decay[i] = 0 startup_block = self.helper.startup_program.global_block() for g in grads: @@ -225,8 +225,6 @@ class DistributedFusedLamb(Optimizer): 'Moment2': [moment2], 'Beta1Pow': [beta1pow], 'Beta2Pow': [beta2pow], - 'FusedIndices': [fused_indices], - 'WeightDecay': [weight_decay], 'GlobalScale': [scale], 'ParamInfo': [param_info], 'ParamOut': params, @@ -235,12 +233,13 @@ class DistributedFusedLamb(Optimizer): 'FP32ShardFusedParamOffsets': [fp32_partial_fused_offsets], 'FP16ShardFusedParamOffsets': [fp16_partial_fused_offsets], 'FusedParamOffsets': [fused_offsets], + 'ParamOrder': [param_order], }, attrs={ 'alignment': self._alignment, 'rank': rank, 'nranks': nranks, - 'weight_decay': weight_decay_values, + 'apply_weight_decay': apply_weight_decay, 'moment1': 0.0, 'moment2': 0.0, 'beta1': self._beta1, @@ -272,8 +271,6 @@ class DistributedFusedLamb(Optimizer): 'Moment2': [moment2], 'Beta1Pow': [beta1pow], 'Beta2Pow': [beta2pow], - 'FusedIndices': [fused_indices], - 'WeightDecay': [weight_decay], 'GlobalScale': [scale], 
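The optimizer change above replaces the per-parameter WeightDecay tensor with an integer apply_weight_decay mask, and the decay strength itself becomes a single scalar attribute (added a few hunks below). A hypothetical illustration of how exclude_from_weight_decay_fn produces that mask; the parameter names and exclusion rule here are made up:

    params = ['fc_0.w_0', 'fc_0.b_0', 'fc_1.w_0', 'fc_1.b_0']     # hypothetical names
    exclude_fn = lambda name: name.endswith('.b_0')               # hypothetical rule
    apply_weight_decay = [0 if exclude_fn(p) else 1 for p in params]
    # -> [1, 0, 1, 0]; the strength is the scalar self._weight_decay attribute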
'ParamInfo': [param_info], 'Param': params, @@ -281,6 +278,7 @@ class DistributedFusedLamb(Optimizer): 'FusedParamOffsets': [fused_offsets], 'FP32ShardFusedParamOffsets': [fp32_partial_fused_offsets], 'FP16ShardFusedParamOffsets': [fp16_partial_fused_offsets], + 'ParamOrder': [param_order], }, outputs={ 'FP32FusedParamOut': [fp32_fused_param], @@ -294,6 +292,7 @@ class DistributedFusedLamb(Optimizer): 'FoundInf': [self._found_inf], }, attrs={ + 'weight_decay': self._weight_decay, 'beta1': self._beta1, 'beta2': self._beta2, 'epsilon': self._epsilon, diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index e59ef5ebfb0ab26c16c78933733bc11c0c4148d0..e6efde836284ac361f9781a0cb18b0df72afe354 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -1667,11 +1667,11 @@ def cross_entropy(input, label_min = paddle.min(valid_label) label_max = paddle.max(valid_label) if label_min < 0: - raise ValueError("label should not out of bound, but got{}". - format(label_min)) + raise ValueError("Target {} is out of lower bound.".format( + label_min.item())) if label_max >= input.shape[axis]: - raise ValueError("label should not out of bound, but got{}". - format(label_max)) + raise ValueError("Target {} is out of upper bound.".format( + label_max.item())) if core.is_compiled_with_npu() or core.is_compiled_with_mlu(): _, _, out = _C_ops.softmax_with_cross_entropy( input, label, 'soft_label', soft_label, 'ignore_index', diff --git a/python/paddle/utils/code_gen/api_base.py b/python/paddle/utils/code_gen/api_base.py index 5fc9dfe3f6499701f75fffc62bdcf3f9a0c28821..cfd817c24c7367f69673353a8aaceeedec506e15 100644 --- a/python/paddle/utils/code_gen/api_base.py +++ b/python/paddle/utils/code_gen/api_base.py @@ -451,7 +451,20 @@ PADDLE_API {self.outputs['return_type']} {self.get_api_func_name() + '_'}({self. param_code = "" for param in infer_meta_params: if param in input_names: - if param in self.optional_vars: + if self.inputs['input_info'][param] == "const Tensor&": + param_code = param_code + "MakeMetaTensor(*" + PREFIX_TENSOR_NAME + param + "), " + elif self.inputs['input_info'][ + param] == "const std::vector&": + meta_tensor_code = meta_tensor_code + f""" +{code_indent} auto {param}_meta_vec = MakeMetaTensor(*{PREFIX_TENSOR_NAME}{param}); +{code_indent} std::vector {param}_metas({param}_meta_vec.size()); +{code_indent} for (size_t i = 0; i < {param}_meta_vec.size(); ++i) {{ +{code_indent} {param}_metas[i] = &{param}_meta_vec[i]; +{code_indent} }} +""" + + param_code = param_code + param + "_metas, " + elif param in self.optional_vars: meta_tensor_code = meta_tensor_code + f""" {code_indent} paddle::optional {PREFIX_TENSOR_NAME}meta_ref_{param}(paddle::none); {code_indent} auto {PREFIX_TENSOR_NAME}meta_{param} = MakeMetaTensor({PREFIX_TENSOR_NAME}{param}); @@ -461,7 +474,9 @@ PADDLE_API {self.outputs['return_type']} {self.get_api_func_name() + '_'}({self. param_code = param_code + f"{PREFIX_TENSOR_NAME}meta_ref_{param}, " else: - param_code = param_code + "MakeMetaTensor(*" + PREFIX_TENSOR_NAME + param + "), " + raise ValueError( + f"{self.api} : Param of infer_meta error : {self.inputs['input_info'][param]} type is not supported." 
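The reworded cross_entropy errors above fire when a hard label falls outside [0, num_classes). A hypothetical repro of the upper-bound case, kept commented out:

    import paddle
    import paddle.nn.functional as F

    logits = paddle.rand([4, 10])              # 10 classes
    labels = paddle.to_tensor([1, 3, 10, 2])   # 10 is >= num_classes
    # F.cross_entropy(logits, labels)  # would raise: "Target 10 is out of upper bound."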
+ ) elif param in kernel_output_names: meta_tensor_code = meta_tensor_code + code_indent + " phi::MetaTensor " + param.replace( 'kernel_', PREFIX_META_TENSOR_NAME) + "(" + param + ");\n" diff --git a/tools/ci_op_benchmark.sh b/tools/ci_op_benchmark.sh index 80efd32ecf14eebac990dd8a531c134e95e7c039..1db79418b2d8f296c37d7757cace7b7bc2a8141c 100644 --- a/tools/ci_op_benchmark.sh +++ b/tools/ci_op_benchmark.sh @@ -106,7 +106,7 @@ function prepare_benchmark_environment { [ $? -ne 0 ] && LOG "[FATAL] Clone benchmark repo fail." && exit -1 LOG "[INFO] Collect api info ..." python benchmark/api/deploy/collect_api_info.py \ - --test_module_name tests_v2 \ + --test_module_name dynamic_tests_v2 \ --info_file api_info.txt >& 2 [ $? -ne 0 ] && LOG "[FATAL] Collect api info fail." && exit -1 [ ! -f benchmark/ci/scripts/op_benchmark.config ] && LOG "[FATAL] Missing op_benchmark.config!" && exit -1 @@ -185,7 +185,7 @@ function run_op_benchmark_test { logs_dir="$(pwd)/logs-${branch_name}" [ -d $logs_dir ] && rm -rf $logs_dir/* || mkdir -p $logs_dir pushd benchmark/api > /dev/null - bash deploy/main_control.sh tests_v2 \ + bash deploy/main_control.sh dynamic_tests_v2 \ tests_v2/configs \ $logs_dir \ $VISIBLE_DEVICES \ @@ -212,7 +212,7 @@ function check_op_benchmark_result { # there is no need to recompile and install paddle LOG "[INFO] retry ${retry_time} times ..." pushd benchmark/api > /dev/null - bash deploy/main_control.sh tests_v2 \ + bash deploy/main_control.sh dynamic_tests_v2 \ tests_v2/configs \ ${logs_dir} \ $VISIBLE_DEVICES \ diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py index 4df27bfe4e923868ac5267119c5b56b6ba3839c8..7f8e516496f32352fa18f950a4687d5b52f4d10d 100755 --- a/tools/parallel_UT_rule.py +++ b/tools/parallel_UT_rule.py @@ -958,7 +958,6 @@ FOURTH_HIGH_PARALLEL_JOB_NEW = [ 'test_dynamic_rnn_stop_gradient', 'test_raw_program_optimizer', 'test_pow', 'test_inplace_softmax_with_cross_entropy', 'test_transforms', 'test_unfold_op', 'test_assign_op', 'test_isinstance', - 'test_conv_affine_channel_fuse_pass', 'auto_growth_best_fit_allocator_facade_test', 'test_cholesky_op', 'test_adaptive_avg_pool3d', 'test_paddle_save_load_binary', 'test_fused_fc_elementwise_layernorm_op', 'test_sequence_enumerate_op', @@ -1873,7 +1872,6 @@ TETRAD_PARALLEL_JOB = [ 'test_dataloader_unkeep_order', 'test_parallel_executor_profiler', 'test_correlation', - 'test_conv_affine_channel_fuse_pass', 'test_ir_inplace_pass', 'test_moving_average_abs_max_scale_op', 'test_flatten_contiguous_range_op', diff --git a/tools/static_mode_white_list.py b/tools/static_mode_white_list.py index 694283264ca8f63ec3bcbe73a884c6a9f280bc15..7356f0c8db02551c930e424571cf779f0c3dbc9c 100755 --- a/tools/static_mode_white_list.py +++ b/tools/static_mode_white_list.py @@ -578,7 +578,6 @@ STATIC_MODE_TESTING_LIST = [ 'test_ir_embedding_eltwise_layernorm_fuse_pass', 'test_ir_fc_fuse_pass', 'test_ir_skip_layernorm_pass', - 'test_conv_affine_channel_fuse_pass', 'test_conv_bias_mkldnn_fuse_pass', 'test_conv_bn_fuse_pass', 'test_conv_elementwise_add2_act_fuse_pass',