Commit a4bccde0 authored by phlrain

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into move_sgd_to_phi

@@ -36,7 +36,7 @@ ENDIF()
if(NOT DEFINED XPU_BASE_URL)
SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev")
-SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220219")
+SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220228")
else()
SET(XPU_BASE_URL "${XPU_BASE_URL}")
endif()
......
@@ -667,6 +667,7 @@ function(xpu_library TARGET_NAME)
else()
xpu_add_library(${TARGET_NAME} STATIC ${xpu_library_SRCS} DEPENDS ${xpu_library_DEPS})
find_fluid_modules(${TARGET_NAME})
+find_phi_modules(${TARGET_NAME})
endif()
if (xpu_library_DEPS)
add_dependencies(${TARGET_NAME} ${xpu_library_DEPS})
......
@@ -83,6 +83,8 @@ function(kernel_declare TARGET_LIST)
file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, XPU, ALL_LAYOUT);\n")
elseif (${kernel_path} MATCHES "./gpudnn\/")
file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, GPUDNN, ALL_LAYOUT);\n")
+elseif (${kernel_path} MATCHES "./kps\/")
+file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, KPS, ALL_LAYOUT);\n")
else ()
# deal with device independent kernel, now we use CPU temporaary
file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, CPU, ALL_LAYOUT);\n")
@@ -97,6 +99,7 @@ function(kernel_library TARGET)
set(gpu_srcs)
set(xpu_srcs)
set(gpudnn_srcs)
+set(kps_srcs)
set(selected_rows_srcs)
# parse and save the deps kerenl targets
set(all_srcs)
@@ -128,6 +131,9 @@ function(kernel_library TARGET)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu.cc)
list(APPEND gpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu.cc)
endif()
+if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu)
+list(APPEND gpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu)
+endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}_gpudnn.cu)
list(APPEND gpudnn_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}_gpudnn.cu)
endif()
@@ -137,6 +143,15 @@ function(kernel_library TARGET)
list(APPEND xpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/xpu/${TARGET}.cc)
endif()
endif()
+if (WITH_XPU_KP)
+if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu)
+# Change XPU2 file suffix
+# NOTE(chenweihang): If we can be sure that the *.kps suffix is no longer used, it can be copied directly to *.xpu
+file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/kps)
+file(RENAME ${CMAKE_CURRENT_BINARY_DIR}/kps/${TARGET}.cu ${CMAKE_CURRENT_BINARY_DIR}/kps/${TARGET}.kps)
+list(APPEND kps_srcs ${CMAKE_CURRENT_BINARY_DIR}/kps/${TARGET}.kps)
+endif()
+endif()
else()
# TODO(chenweihang): impl compile by source later
endif()
@@ -150,6 +165,7 @@ function(kernel_library TARGET)
list(APPEND all_srcs ${gpu_srcs})
list(APPEND all_srcs ${xpu_srcs})
list(APPEND all_srcs ${gpudnn_srcs})
+list(APPEND all_srcs ${kps_srcs})
foreach(src ${all_srcs})
file(READ ${src} target_content)
string(REGEX MATCHALL "#include \"paddle\/phi\/kernels\/[a-z0-9_]+_kernel.h\"" include_kernels ${target_content})
@@ -159,11 +175,11 @@ function(kernel_library TARGET)
string(REGEX MATCHALL "#include \"paddle\/phi\/kernels\/${kernel_library_SUB_DIR}\/[a-z0-9_]+_kernel.h\"" include_kernels ${target_content})
endif()
foreach(include_kernel ${include_kernels})
if ("${kernel_library_SUB_DIR}" STREQUAL "")
string(REGEX REPLACE "#include \"paddle\/phi\/kernels\/" "" kernel_name ${include_kernel})
else()
string(REGEX REPLACE "#include \"paddle\/phi\/kernels\/${kernel_library_SUB_DIR}\/" "" kernel_name ${include_kernel})
endif()
string(REGEX REPLACE ".h\"" "" kernel_name ${kernel_name})
list(APPEND kernel_deps ${kernel_name})
endforeach()
@@ -176,11 +192,20 @@ function(kernel_library TARGET)
list(LENGTH gpu_srcs gpu_srcs_len)
list(LENGTH xpu_srcs xpu_srcs_len)
list(LENGTH gpudnn_srcs gpudnn_srcs_len)
+list(LENGTH kps_srcs kps_srcs_len)
list(LENGTH selected_rows_srcs selected_rows_srcs_len)
+# kernel source file level
+# level 1: base device kernel
+# - cpu_srcs / gpu_srcs / xpu_srcs / kps_srcs
+# level 2: device-independent kernel
+# - common_srcs
+# level 3: Kernel implemented by reusing device-independent kernel
+# - selected_rows_srcs
# Build Target according different src organization
if((${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR
-${xpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) AND
+${xpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0) AND
(${common_srcs_len} GREATER 0 OR ${selected_rows_srcs_len} GREATER 0))
# If the common_srcs/selected_rows_srcs depends on specific device srcs, build target using this rule.
if (WITH_GPU)
@@ -193,6 +218,11 @@ function(kernel_library TARGET)
hip_library(${TARGET}_part SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
hip_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part)
endif()
+elseif (WITH_XPU_KP)
+if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0)
+xpu_library(${TARGET}_part SRCS ${cpu_srcs} ${xpu_srcs} ${kps_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+xpu_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part)
+endif()
else()
if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0)
cc_library(${TARGET}_part SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
@@ -200,7 +230,7 @@ function(kernel_library TARGET)
endif()
endif()
# If there are only specific device srcs, build target using this rule.
-elseif (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0)
+elseif (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0)
if (WITH_GPU)
if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0)
nv_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
@@ -209,6 +239,10 @@ function(kernel_library TARGET)
if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0)
hip_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
endif()
+elseif (WITH_XPU_KP)
+if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0)
+xpu_library(${TARGET} SRCS ${cpu_srcs} ${xpu_srcs} ${kps_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+endif()
else()
if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0)
cc_library(${TARGET} SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
@@ -222,6 +256,9 @@ function(kernel_library TARGET)
elseif (WITH_ROCM)
hip_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
hip_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part)
+elseif (WITH_XPU_KP)
+xpu_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+xpu_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part)
else()
cc_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
cc_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part)
@@ -232,6 +269,8 @@ function(kernel_library TARGET)
nv_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
elseif (WITH_ROCM)
hip_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+elseif (WITH_XPU_KP)
+xpu_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
else()
cc_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
endif()
@@ -240,6 +279,8 @@ function(kernel_library TARGET)
nv_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
elseif (WITH_ROCM)
hip_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+elseif (WITH_XPU_KP)
+xpu_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
else()
cc_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
endif()
@@ -249,7 +290,7 @@ function(kernel_library TARGET)
if (${target_build_flag} EQUAL 1)
if (${common_srcs_len} GREATER 0 OR ${cpu_srcs_len} GREATER 0 OR
-${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR
+${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0 OR
${gpudnn_srcs_len} GREATER 0 OR ${selected_rows_srcs_len} GREATER 0)
# append target into PHI_KERNELS property
get_property(phi_kernels GLOBAL PROPERTY PHI_KERNELS)
@@ -275,6 +316,9 @@ function(kernel_library TARGET)
if (${gpudnn_srcs_len} GREATER 0)
kernel_declare(${gpudnn_srcs})
endif()
+if (${kps_srcs_len} GREATER 0)
+kernel_declare(${kps_srcs})
+endif()
if (${selected_rows_srcs_len} GREATER 0)
kernel_declare(${selected_rows_srcs})
endif()
......
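For reference, the new kps branch in kernel_declare only appends one more declaration line to the generated declarations file. For a hypothetical kernel named scale registered from a kps/ source (the kernel name is illustrative, not taken from this commit), the emitted line would look like:

// Declaration appended by the kps branch of kernel_declare above; "scale" is a placeholder name.
PD_DECLARE_KERNEL(scale, KPS, ALL_LAYOUT);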
cc_library(processgroup SRCS ProcessGroup.cc DEPS phi phi_api eager_api)
+cc_library(eager_reducer SRCS reducer.cc DEPS eager_api processgroup)
if(WITH_NCCL)
cc_library(processgroup_nccl SRCS ProcessGroupNCCL.cc DEPS place cuda_stream enforce collective_helper device_context phi phi_api eager_api)
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/collective/reducer.h"
#include "paddle/phi/common/data_type.h"
namespace paddle {
namespace distributed {
std::vector<std::vector<size_t>> Eager_AssignGroupBySize(
const std::vector<Tensor> tensors,
const std::vector<bool> &is_sparse_gradient,
const std::vector<size_t> &group_size_limits,
const std::vector<int64_t> &tensor_indices) {
PADDLE_ENFORCE_EQ(
tensors.size(), is_sparse_gradient.size(),
platform::errors::PreconditionNotMet(
"tensors len must be equal to is_sparse_gradient len, but "
"[%lu] != [%lu]",
tensors.size(), is_sparse_gradient.size()));
auto check_perm = [](const std::vector<int64_t> &x) -> bool {
size_t len = x.size();
std::vector<size_t> cnt(len, 0);
for (size_t i = 0; i < len; ++i) {
if (x[i] >= static_cast<int64_t>(len) || x[i] < 0 || cnt[x[i]]) {
return false;
}
cnt[x[i]]++;
}
return true;
};
PADDLE_ENFORCE_EQ(true, check_perm(tensor_indices),
platform::errors::PreconditionNotMet(
"tensor_indices must be a permutation from 0 to %lu",
tensor_indices.size()));
// the return vector
std::vector<std::vector<size_t>> res;
// Key: the var type
// Value: should use which index in group_size_limits for group size limit
std::map<experimental::DataType, size_t> group_limit_index;
// Key: the var type
// Value: <the var index in input tensors, total numel in this group>
std::map<experimental::DataType, std::pair<std::vector<size_t>, size_t>>
next_group;
for (size_t i = 0; i < tensors.size(); ++i) {
const auto &var = tensors[i];
size_t tensor_real_index = i;
if (!tensor_indices.empty()) {
tensor_real_index = tensor_indices[i];
}
if (is_sparse_gradient[tensor_real_index]) {
// we keep sparse var a single group
res.push_back({tensor_real_index});
continue;
}
const auto &var_dtype = var.dtype();
VLOG(3) << "var[" << var.name() << "] 's type is " << var_dtype;
auto &group_info = next_group[var_dtype];
int64_t var_size = -1;
if (var.is_dense_tensor()) {
var_size =
std::dynamic_pointer_cast<phi::DenseTensor>(var.impl())->numel();
} else {
VLOG(3) << "var " << var.name()
<< " is not tensor or selected_rows, so skip it";
continue;
}
group_info.first.push_back(tensor_real_index);
group_info.second += experimental::SizeOf(var_dtype) * var_size;
// group_info.second += framework::SizeOfType(var_dtype) * var_size;
if (group_limit_index.find(var_dtype) == group_limit_index.end()) {
// means it is the first var of var_dtype
group_limit_index[var_dtype] = 0;
}
auto &cur_limit_index = group_limit_index[var_dtype];
if (group_info.second >= group_size_limits[cur_limit_index]) {
// exceed group capacity and create a new group
res.emplace_back(std::move(group_info.first));
group_info = std::pair<std::vector<size_t>, size_t>();
cur_limit_index =
(std::min)(cur_limit_index + 1, group_size_limits.size() - 1);
}
}
// add the final groups
for (auto &e : next_group) {
auto &group_info = e.second;
if (!group_info.first.empty()) {
res.emplace_back(std::move(group_info.first));
}
}
for (const auto &group_index : res) {
PADDLE_ENFORCE_NE(
group_index.empty(), true,
platform::errors::PreconditionNotMet(
"AssignGroupBySize construct empty group, please check."));
}
if (tensor_indices.empty()) {
std::sort(res.begin(), res.end(),
[](const std::vector<size_t> &x, const std::vector<size_t> &y) {
return x.front() < y.front();
});
}
return res;
}
} // namespace distributed
} // namespace paddle
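A minimal usage sketch of the grouping helper above (not part of this commit; the tensor setup is elided and the sizes are made-up illustration values):

// Sketch only: assume four dense FP32 gradient tensors of roughly 1 MB each.
// std::vector<Tensor> grads = /* four ~1 MB FP32 dense tensors */;
// std::vector<bool> is_sparse(grads.size(), false);   // no sparse gradients
// std::vector<size_t> limits = {2 * 1024 * 1024};     // single 2 MB group size limit, in bytes
// auto groups = paddle::distributed::Eager_AssignGroupBySize(
//     grads, is_sparse, limits, /*tensor_indices=*/{});
// A group is closed as soon as its accumulated byte size reaches the current limit,
// so the expected result here is groups == {{0, 1}, {2, 3}}, sorted by first index.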
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -14,41 +14,19 @@
#pragma once
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
-namespace paddle {
-namespace framework {
-namespace ir {
-/*
- * Fuse the Conv and ConvAffineChannel.
- */
-class Graph;
-class ConvAffineChannelFusePass : public FusePassBase {
-public:
-ConvAffineChannelFusePass();
-virtual ~ConvAffineChannelFusePass() {}
-protected:
-void ApplyImpl(ir::Graph*) const override;
-const std::string name_scope_{"conv_affine_channel_fuse"};
-};
-class ConvEltwiseAddAffineChannelFusePass : public FusePassBase {
-public:
-ConvEltwiseAddAffineChannelFusePass();
-virtual ~ConvEltwiseAddAffineChannelFusePass() {}
-protected:
-void ApplyImpl(ir::Graph*) const override;
-const std::string name_scope_{"conv_eltwiseadd_affine_channel_fuse"};
-};
-} // namespace ir
-} // namespace framework
-} // namespace paddle
+#include <map>
+#include <vector>
+#include "paddle/fluid/distributed/collective/ProcessGroup.h"
+#include "paddle/fluid/eager/api/utils/tensor_utils.h"
+namespace paddle {
+namespace distributed {
+using Tensor = paddle::experimental::Tensor;
+std::vector<std::vector<size_t>> Eager_AssignGroupBySize(
+const std::vector<Tensor>, const std::vector<bool>& is_sparse_gradient,
+const std::vector<size_t>& group_size_limits,
+const std::vector<int64_t>& tensor_indices = {});
+} // namespace distributed
+} // namespace paddle
@@ -308,22 +308,25 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx,
// TODO(chenweihang): support multiple inputs and outputs later
phi::InferMetaContext infer_mete_context;
for (auto& in_name : input_names) {
-if (ctx->HasInput(in_name)) {
-infer_meta_context.EmplaceBackInput(std::make_shared<CompatMetaTensor>(
-ctx->GetInputVarPtrs(in_name)[0], ctx->IsRuntime()));
+if (ctx->HasInputs(in_name)) {
+auto input_var = ctx->GetInputVarPtrs(in_name);
+if (input_var.size() == 1) {
+infer_meta_context.EmplaceBackInput(
+std::make_shared<CompatMetaTensor>(input_var[0], ctx->IsRuntime()));
+} else {
+paddle::SmallVector<std::shared_ptr<phi::MetaTensor>> inputs;
+inputs.reserve(input_var.size());
+for (const auto& in : input_var) {
+inputs.push_back(
+std::make_shared<CompatMetaTensor>(in, ctx->IsRuntime()));
+}
+infer_meta_context.EmplaceBackInputs(std::move(inputs));
+}
} else {
infer_meta_context.EmplaceBackInput({nullptr});
}
}
-for (auto& out_name : output_names) {
-if (ctx->HasOutput(out_name)) {
-infer_meta_context.EmplaceBackOutput(std::make_shared<CompatMetaTensor>(
-ctx->GetOutputVarPtrs(out_name)[0], ctx->IsRuntime()));
-} else {
-infer_meta_context.EmplaceBackOutput({nullptr});
-}
-}
auto attr_reader = ctx->Attrs();
for (size_t i = 0; i < attr_names.size(); ++i) {
auto attr_name = attr_names[i];
@@ -348,13 +351,13 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx,
}
} else {
// If is not in runtime, we will set default value(-1) for ScalarArray
-int64_t num_ele = 0;
std::vector<VarDesc*> vars;
vars.reserve(infershape_inputs.size());
-for (size_t i = 0; i < infershape_inputs.size(); i++) {
+for (size_t i = 0; i < infershape_inputs.size(); ++i) {
vars.push_back(BOOST_GET_CONST(VarDesc*, infershape_inputs[i]));
}
+int64_t num_ele = 0;
if (vars.size() == 1) {
num_ele = 1;
const auto& tensor_dims = vars[0]->GetShape();
@@ -362,16 +365,7 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx,
num_ele *= tensor_dims[i];
}
} else {
-for (auto& var : vars) {
-const auto& tensor_dims = var->GetShape();
-PADDLE_ENFORCE_EQ(tensor_dims.size(), 1,
-platform::errors::InvalidArgument(
-"The shape is constructed by multi-tensor, "
-"every tensor's dims should be 1. But your "
-"shape has tensor that dims is %s.",
-tensor_dims.size()));
-num_ele += tensor_dims[0];
-}
+num_ele = vars.size();
}
phi::ScalarArray tensor_attr(std::vector<int32_t>(num_ele, -1));
tensor_attr.SetFromTensor(true);
@@ -383,10 +377,14 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx,
std::type_index(typeid(std::vector<int32_t>))) {
infer_meta_context.EmplaceBackAttr(std::move(
phi::ScalarArray(BOOST_GET_CONST(std::vector<int32_t>, attr))));
+} else if (std::type_index(attr.type()) ==
+std::type_index(typeid(int))) {
+infer_meta_context.EmplaceBackAttr(
+phi::ScalarArray({BOOST_GET_CONST(int, attr)}));
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Unsupported cast op attribute `%s` to ScalarArray when "
-"construct KernelContext.",
+"construct InferMetaContext.",
attr_name));
}
}
@@ -414,7 +412,6 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx,
}
} else if (ctx->HasInput(attr_name)) {
const auto& infershape_input = ctx->GetInputVarPtrs(attr_name);
if (infershape_input.size() == 1) {
if (ctx->IsRuntime()) {
Variable* var = BOOST_GET_CONST(Variable*, infershape_input[0]);
@@ -490,6 +487,28 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx,
"Unsupported attribute type is received when call "
"InferShapeFunctor."));
}
+} else {
+// do nothing
+}
+}
+for (auto& out_name : output_names) {
+if (ctx->HasOutputs(out_name)) {
+auto output_var = ctx->GetOutputVarPtrs(out_name);
+if (output_var.size() == 1) {
+infer_meta_context.EmplaceBackOutput(std::make_shared<CompatMetaTensor>(
+output_var[0], ctx->IsRuntime()));
+} else {
+paddle::SmallVector<std::shared_ptr<phi::MetaTensor>> outputs;
+outputs.reserve(output_var.size());
+for (const auto& out : output_var) {
+outputs.emplace_back(
+std::make_shared<CompatMetaTensor>(out, ctx->IsRuntime()));
+}
+infer_meta_context.EmplaceBackOutputs(std::move(outputs));
+}
+} else {
+infer_meta_context.EmplaceBackOutput({nullptr});
+}
}
}
......
@@ -78,7 +78,6 @@ pass_library(is_test_pass base)
pass_library(conv_elementwise_add_act_fuse_pass inference)
pass_library(conv_elementwise_add2_act_fuse_pass inference)
pass_library(conv_elementwise_add_fuse_pass inference)
-pass_library(conv_affine_channel_fuse_pass inference)
pass_library(transpose_flatten_concat_fuse_pass inference)
pass_library(identity_scale_op_clean_pass base)
pass_library(sync_batch_norm_pass base)
......
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h"
#include <cmath>
#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/framework/op_version_registry.h"
namespace phi {
class DenseTensor;
} // namespace phi
namespace paddle {
namespace framework {
class Scope;
} // namespace framework
} // namespace paddle
namespace paddle {
namespace framework {
namespace ir {
class Node;
#define GET_CONV_BN_NODES(pattern_name) \
/* OPERATORS */ \
GET_IR_NODE_FROM_SUBGRAPH(conv, conv, pattern_name); \
GET_IR_NODE_FROM_SUBGRAPH(affine_channel, affine_channel, pattern_name); \
/* CONV inputs */ \
GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight, pattern_name); \
/* CONV outputs */ \
GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, pattern_name); \
/* Affine Channel inputs */ \
GET_IR_NODE_FROM_SUBGRAPH(ac_scale, ac_scale, pattern_name); \
GET_IR_NODE_FROM_SUBGRAPH(ac_bias, ac_bias, pattern_name); \
/* Affine channel outputs */ \
GET_IR_NODE_FROM_SUBGRAPH(ac_out, ac_out, pattern_name); /* Out */
void recompute_bias_and_weights(const Scope* scope, ir::Node* conv_weight,
const ir::Node& ac_scale,
const LoDTensor& ac_bias_tensor,
LoDTensor* eltwise_y_in_tensor) {
using EigenVectorArrayMap =
Eigen::Map<Eigen::Array<float, Eigen::Dynamic, 1>>;
using ConstEigenVectorArrayMap =
Eigen::Map<const Eigen::Array<float, Eigen::Dynamic, 1>>;
using EigenMatrixArrayMap = Eigen::Map<
Eigen::Array<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
// Re-compute bias of conv2d from AffineChannel
PADDLE_ENFORCE_EQ(
eltwise_y_in_tensor->dims(), ac_bias_tensor.dims(),
platform::errors::InvalidArgument(
"Tensor elementwise y(%d) and activation bias(%d) must have same "
"dimension.",
eltwise_y_in_tensor->dims().size(), ac_bias_tensor.dims().size()));
auto* scale_tensor = scope->FindVar(ac_scale.Name())->GetMutable<LoDTensor>();
ConstEigenVectorArrayMap scale_array(scale_tensor->data<float>(),
scale_tensor->numel(), 1);
ConstEigenVectorArrayMap ac_bias_array(ac_bias_tensor.data<float>(),
ac_bias_tensor.numel(), 1);
EigenVectorArrayMap eltwise_y_in_array(
eltwise_y_in_tensor->mutable_data<float>(platform::CPUPlace()),
eltwise_y_in_tensor->numel(), 1);
eltwise_y_in_array = (eltwise_y_in_array * scale_array) + ac_bias_array;
// Re-compute weight of conv2d from AffineChannel
auto* weights = scope->FindVar(conv_weight->Name())->GetMutable<LoDTensor>();
auto weights_shape = weights->dims();
auto weights_shape_2d = phi::flatten_to_2d(weights_shape, 1);
auto* weights_data = weights->mutable_data<float>(platform::CPUPlace());
EigenMatrixArrayMap weights_array_2d(weights_data, weights_shape_2d[0],
weights_shape_2d[1]);
weights_array_2d.colwise() *= scale_array;
// Check for subnormal values that slows down convolution execution
for (int i = 0; i < weights->numel(); ++i) {
if (std::fpclassify(weights_data[i]) == FP_SUBNORMAL) weights_data[i] = 0;
}
}
ConvAffineChannelFusePass::ConvAffineChannelFusePass() {
AddOpCompat(OpCompat("conv2d"))
.AddInput("Input")
.IsTensor()
.End()
.AddInput("Filter")
.IsTensor()
.End()
.AddInput("Bias")
.IsTensor()
.IsOptional()
.End()
.AddInput("ResidualData")
.IsTensor()
.IsOptional()
.End()
.AddOutput("Output")
.IsTensor()
.End()
.AddAttr("strides")
.IsType<std::vector<int>>()
.End()
.AddAttr("paddings")
.IsType<std::vector<int>>()
.End()
.AddAttr("padding_algorithm")
.IsOptional()
.IsStringIn({"EXPLICIT", "SAME", "VALID"})
.End()
.AddAttr("groups")
.IsNumGE(1)
.End()
.AddAttr("dilations")
.IsType<std::vector<int>>()
.End()
.AddAttr("data_format")
.IsStringIn({"NCHW", "AnyLayout"})
.End();
AddOpCompat(OpCompat("affine_channel"))
.AddInput("X")
.IsTensor()
.End()
.AddInput("Scale")
.IsTensor()
.End()
.AddInput("Bias")
.IsTensor()
.IsOptional()
.End()
.AddOutput("Out")
.IsTensor()
.End()
.AddAttr("data_layout")
.IsStringIn({"NCHW", "AnyLayout"})
.End();
AddOpCompat(OpCompat("elementwise_add"))
.AddInput("X")
.IsTensor()
.End()
.AddInput("Y")
.IsTensor()
.End()
.AddOutput("Out")
.IsTensor()
.End()
.AddAttr("axis")
.IsNumEQ(1)
.End();
}
void ConvAffineChannelFusePass::ApplyImpl(ir::Graph* graph) const {
PADDLE_ENFORCE_NOT_NULL(
graph, platform::errors::InvalidArgument("Graph cannot be nullptr."));
FusePassBase::Init(name_scope_, graph);
auto* scope = param_scope();
PADDLE_ENFORCE_NOT_NULL(
scope, platform::errors::InvalidArgument("Scope cannot be nullptr."));
GraphPatternDetector gpd;
auto* conv_input =
gpd.mutable_pattern()
->NewNode(patterns::PDNodeName(name_scope_, "conv_input"))
->AsInput()
->assert_is_op_input("conv2d", "Input");
patterns::ConvAffineChannel conv_ac_pattern(gpd.mutable_pattern(),
name_scope_);
conv_ac_pattern(conv_input, false /*with_eltwise_add*/);
int found_conv_ac_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
if (!IsCompat(subgraph, g)) {
LOG(WARNING) << "ConvAffineChannelFusePass in op compat failed.";
return;
}
VLOG(4) << "handle ConvAffineChannel fuse";
GET_CONV_BN_NODES(conv_ac_pattern);
auto data_format = conv->Op()->GetAttrIfExists<std::string>("data_format");
if (data_format == "AnyLayout") {
LOG_FIRST_N(WARNING, 1) << "conv_affine_channel_fuse_pass is enabled, "
"it's wrong if data_format of conv is not "
"NCHW.";
}
// Get affine_channel bias for resizing eltwise_y!
auto* ac_bias_tensor =
scope->FindVar(ac_bias->Name())->GetMutable<LoDTensor>();
// Create eltwise_y (conv bias) variable
VarDesc eltwise_y_in_desc(
patterns::PDNodeName(name_scope_, "eltwise_y_in"));
// Set shape && datatype manually
eltwise_y_in_desc.SetShape(phi::vectorize(ac_bias_tensor->dims()));
eltwise_y_in_desc.SetDataType(
framework::TransToProtoVarType(ac_bias_tensor->dtype()));
eltwise_y_in_desc.SetLoDLevel(ac_bias->Var()->GetLoDLevel());
eltwise_y_in_desc.SetPersistable(true);
// Initialize eltwise_y
auto* eltwise_y_in_node = g->CreateVarNode(&eltwise_y_in_desc);
auto* eltwise_y_in_tensor =
scope->Var(eltwise_y_in_node->Name())->GetMutable<LoDTensor>();
eltwise_y_in_tensor->Resize(ac_bias_tensor->dims());
std::fill_n(eltwise_y_in_tensor->mutable_data<float>(platform::CPUPlace()),
eltwise_y_in_tensor->numel(), 0.0f);
// update weights and biases
recompute_bias_and_weights(scope, conv_weight, *ac_scale, *ac_bias_tensor,
eltwise_y_in_tensor);
// create an elementwise add node.
OpDesc desc;
desc.SetInput("X", std::vector<std::string>({conv_out->Name()}));
desc.SetInput("Y", std::vector<std::string>({eltwise_y_in_node->Name()}));
desc.SetOutput("Out", std::vector<std::string>({ac_out->Name()}));
desc.SetType("elementwise_add");
desc.SetAttr("axis", 1);
desc.SetAttr("use_mkldnn", conv->Op()->GetAttrIfExists<bool>("use_mkldnn"));
auto eltwise_op = g->CreateOpNode(&desc); // OpDesc will be copied.
GraphSafeRemoveNodes(graph, {ac_scale, ac_bias, affine_channel});
IR_NODE_LINK_TO(conv_out, eltwise_op);
IR_NODE_LINK_TO(eltwise_y_in_node, eltwise_op);
IR_NODE_LINK_TO(eltwise_op, ac_out);
found_conv_ac_count++;
};
gpd(graph, handler);
AddStatis(found_conv_ac_count);
}
ConvEltwiseAddAffineChannelFusePass::ConvEltwiseAddAffineChannelFusePass() {
AddOpCompat(OpCompat("conv2d"))
.AddInput("Input")
.IsTensor()
.End()
.AddInput("Filter")
.IsTensor()
.End()
.AddInput("Bias")
.IsTensor()
.IsOptional()
.End()
.AddInput("ResidualData")
.IsTensor()
.IsOptional()
.End()
.AddOutput("Output")
.IsTensor()
.End()
.AddAttr("strides")
.IsType<std::vector<int>>()
.End()
.AddAttr("paddings")
.IsType<std::vector<int>>()
.End()
.AddAttr("padding_algorithm")
.IsOptional()
.IsStringIn({"EXPLICIT", "SAME", "VALID"})
.End()
.AddAttr("groups")
.IsNumGE(1)
.End()
.AddAttr("dilations")
.IsType<std::vector<int>>()
.End()
.AddAttr("data_format")
.IsStringIn({"NCHW", "AnyLayout"})
.End();
AddOpCompat(OpCompat("affine_channel"))
.AddInput("X")
.IsTensor()
.End()
.AddInput("Scale")
.IsTensor()
.End()
.AddInput("Bias")
.IsTensor()
.IsOptional()
.End()
.AddOutput("Out")
.IsTensor()
.End()
.AddAttr("data_layout")
.IsStringIn({"NCHW", "AnyLayout"})
.End();
AddOpCompat(OpCompat("elementwise_add"))
.AddInput("X")
.IsTensor()
.End()
.AddInput("Y")
.IsTensor()
.End()
.AddOutput("Out")
.IsTensor()
.End()
.AddAttr("axis")
.IsNumEQ(1)
.End();
}
void ConvEltwiseAddAffineChannelFusePass::ApplyImpl(ir::Graph* graph) const {
PADDLE_ENFORCE_NOT_NULL(
graph, platform::errors::InvalidArgument("Graph cannot be nullptr."));
FusePassBase::Init(name_scope_, graph);
auto* scope = param_scope();
PADDLE_ENFORCE_NOT_NULL(
scope, platform::errors::InvalidArgument("Scope cannot be nullptr."));
GraphPatternDetector gpd;
auto* conv_input =
gpd.mutable_pattern()
->NewNode(patterns::PDNodeName(name_scope_, "conv_input"))
->AsInput()
->assert_is_op_input("conv2d", "Input");
patterns::ConvAffineChannel conv_ac_pattern(gpd.mutable_pattern(),
name_scope_);
conv_ac_pattern(conv_input, true /*with_eltwise_add*/);
int found_conv_ac_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
if (!IsCompat(subgraph, g)) {
LOG(WARNING)
<< "ConvEltwiseAddAffineChannelFusePass in op compat failed.";
return;
}
VLOG(4) << "handle ConvBN fuse";
GET_CONV_BN_NODES(conv_ac_pattern);
auto data_format = conv->Op()->GetAttrIfExists<std::string>("data_format");
if (data_format == "AnyLayout") {
LOG_FIRST_N(WARNING, 1) << "conv_eltwiseadd_affine_channel_fuse_pass is "
"enabled, it's wrong if data_format of conv "
"is not NCHW.";
}
// OPERATORS
GET_IR_NODE_FROM_SUBGRAPH(eltwise, eltwise, conv_ac_pattern);
// BIAS inputs
GET_IR_NODE_FROM_SUBGRAPH(eltwise_y_in, eltwise_y_in, conv_ac_pattern);
// BIAS outputs
GET_IR_NODE_FROM_SUBGRAPH(eltwise_out, eltwise_out, conv_ac_pattern);
// Get eltwise_y (conv bias) variable
auto* eltwise_y_in_tensor =
scope->FindVar(eltwise_y_in->Name())->GetMutable<LoDTensor>();
// Get batch norm bias
auto* ac_bias_tensor =
scope->FindVar(ac_bias->Name())->GetMutable<LoDTensor>();
recompute_bias_and_weights(scope, conv_weight, *ac_scale, *ac_bias_tensor,
eltwise_y_in_tensor);
// Update the elementwise_add node
eltwise->Op()->SetAttr("axis", 1);
eltwise->Op()->SetOutput("Out", std::vector<std::string>({ac_out->Name()}));
GraphSafeRemoveNodes(graph,
{ac_scale, ac_bias, affine_channel, eltwise_out});
IR_NODE_LINK_TO(eltwise, ac_out);
found_conv_ac_count++;
};
gpd(graph, handler);
AddStatis(found_conv_ac_count);
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(conv_affine_channel_fuse_pass,
paddle::framework::ir::ConvAffineChannelFusePass);
REGISTER_PASS(conv_eltwiseadd_affine_channel_fuse_pass,
paddle::framework::ir::ConvEltwiseAddAffineChannelFusePass);
REGISTER_PASS_CAPABILITY(conv_affine_channel_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.LE("conv2d", 1)
.EQ("affine_channel", 0));
REGISTER_PASS_CAPABILITY(conv_eltwiseadd_affine_channel_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.LE("conv2d", 1)
.LE("elementwise_add", 1)
.EQ("affine_channel", 0));
@@ -2074,6 +2074,7 @@ void OperatorWithKernel::BuildPhiKernelContext(
}
pt_kernel_context->AssignInputRange(std::make_pair(start_idx, end_idx), i);
}
+VLOG(4) << "Done inputs";
for (size_t i = 0; i < output_names.size(); ++i) {
auto it = ctx.outputs.find(output_names[i]);
@@ -2107,17 +2108,12 @@ void OperatorWithKernel::BuildPhiKernelContext(
"Unsupported output `%s` type when call pt kernel.",
framework::ToTypeName(var->Type())));
}
-experimental::ResetTensorDtypeAndLayoutByArgDef(tensor_out,
-output_defs.at(i));
-SetAllocationForOutputTenosr(
-tensor_out, phi::TransToPhiPlace(output_defs.at(i).backend));
pt_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out);
}
pt_kernel_context->AssignOutputRange(std::make_pair(start_idx, end_idx), i);
}
+VLOG(4) << "Done outputs";
for (size_t i = 0; i < attr_names.size(); ++i) {
if (attr_defs[i].type_index == std::type_index(typeid(phi::ScalarArray))) {
@@ -2226,6 +2222,7 @@ void OperatorWithKernel::BuildPhiKernelContext(
}
}
}
+VLOG(4) << "Done attributes";
}
} // namespace framework
......
@@ -68,6 +68,8 @@ OpKernelType TransPhiKernelKeyToOpKernelType(const phi::KernelKey& kernel_key) {
library_type = LibraryType::kMKLDNN;
} else if (kernel_key.backend() == phi::Backend::GPUDNN) {
library_type = LibraryType::kCUDNN;
+} else if (kernel_key.backend() == phi::Backend::KPS) {
+library_type = LibraryType::kKP;
} else {
// do nothing
}
@@ -82,6 +84,8 @@ phi::KernelKey TransOpKernelTypeToPhiKernelKey(
backend = phi::Backend::MKLDNN;
} else if (kernel_type.library_type_ == LibraryType::kCUDNN) {
backend = phi::Backend::GPUDNN;
+} else if (kernel_type.library_type_ == LibraryType::kKP) {
+backend = phi::Backend::KPS;
} else {
// do
}
@@ -229,26 +233,5 @@ static void SetAllocationForUninitializedDenseTensor(
dense_tensor->ResetHolder(shared_allocation);
}
-void SetAllocationForOutputTenosr(phi::TensorBase* tensor,
-const platform::Place& place) {
-if (phi::DenseTensor::classof(tensor)) {
-auto* dense_tensor = static_cast<phi::DenseTensor*>(tensor);
-if (!dense_tensor->IsInitialized() || !(dense_tensor->place() == place)) {
-SetAllocationForUninitializedDenseTensor(dense_tensor, place);
-}
-} else if (phi::SelectedRows::classof(tensor)) {
-auto* selected_rows = static_cast<phi::SelectedRows*>(tensor);
-if (!selected_rows->value().IsInitialized() ||
-!(selected_rows->place() == place)) {
-SetAllocationForUninitializedDenseTensor(selected_rows->mutable_value(),
-place);
-}
-} else {
-PADDLE_THROW(platform::errors::Unimplemented(
-"Unsupported tensor type is received when setting allocation for "
-"output tensor."));
-}
-}
} // namespace framework
} // namespace paddle
@@ -62,9 +62,6 @@ class KernelArgsNameMaker {
void InitDefaultKernelSignatureMap();
-void SetAllocationForOutputTenosr(phi::TensorBase* tensor,
-const platform::Place& place);
// TODO(Wilber): support others device context.
template <typename T>
struct ConvertToPhiContext {
......
@@ -323,12 +323,6 @@ void BuildDygraphPhiKernelContext(
"Unsupported output `%s` type when call pt kernel.",
framework::ToTypeName(var->Type())));
}
-experimental::ResetTensorDtypeAndLayoutByArgDef(tensor_out,
-output_defs.at(i));
-framework::SetAllocationForOutputTenosr(
-tensor_out, phi::TransToPhiPlace(output_defs.at(i).backend));
kernel_ctx->EmplaceBackOutputWithoutSetRange(tensor_out);
}
kernel_ctx->AssignOutputRange(std::make_pair(start_idx, end_idx), i);
......
@@ -75,13 +75,11 @@ void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) {
void PaddlePassBuilder::ClearPasses() { passes_.clear(); }
const std::vector<std::string> kTRTSubgraphPasses({
-"conv_affine_channel_fuse_pass", //
"adaptive_pool2d_convert_global_pass",
-"conv_eltwiseadd_affine_channel_fuse_pass", //
"shuffle_channel_detect_pass", //
"quant_conv2d_dequant_fuse_pass", //
"delete_quant_dequant_op_pass", //
"delete_quant_dequant_filter_op_pass", //
// "fc_fuse_pass", //
"simplify_with_basic_ops_pass", //
"embedding_eltwise_layernorm_fuse_pass", //
@@ -134,22 +132,20 @@ const std::vector<std::string> kLiteSubgraphPasses({
GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
passes_.assign({
// "identity_scale_op_clean_pass", //
"is_test_pass", //
"simplify_with_basic_ops_pass", //
-"conv_affine_channel_fuse_pass", //
-"conv_eltwiseadd_affine_channel_fuse_pass", //
"conv_bn_fuse_pass", //
"conv_eltwiseadd_bn_fuse_pass", //
"embedding_eltwise_layernorm_fuse_pass", //
"multihead_matmul_fuse_pass_v2", //
"gpu_cpu_squeeze2_matmul_fuse_pass", //
"gpu_cpu_reshape2_matmul_fuse_pass", //
"gpu_cpu_flatten2_matmul_fuse_pass", //
"gpu_cpu_map_matmul_v2_to_mul_pass", //
"gpu_cpu_map_matmul_v2_to_matmul_pass", //
"gpu_cpu_map_matmul_to_mul_pass", //
"fc_fuse_pass", //
"fc_elementwise_layernorm_fuse_pass", //
#if CUDNN_VERSION >= 7100 // To run conv_fusion, the version of cudnn must be
// guaranteed at least v7
// cudnn8.0 has memory leak problem in conv + eltwise + act, so we
@@ -236,14 +232,12 @@ void CpuPassStrategy::EnableMKLDNN() {
passes_.insert(passes_.begin(), "mkldnn_placement_pass");
for (auto &pass : std::vector<std::string>({
"depthwise_conv_mkldnn_pass", //
"conv_bn_fuse_pass", // Execute BN passes again to
"conv_eltwiseadd_bn_fuse_pass", // preserve correct pass order
-"conv_affine_channel_fuse_pass", //
-"conv_eltwiseadd_affine_channel_fuse_pass", //
"conv_transpose_bn_fuse_pass", //
"conv_transpose_eltwiseadd_bn_fuse_pass", //
"conv_bias_mkldnn_fuse_pass", //
"conv_transpose_bias_mkldnn_fuse_pass",
// TODO(baoachun): Need to support 5-dimensional input.
// "conv3d_bias_mkldnn_fuse_pass", //
......
@@ -29,9 +29,5 @@ using CUDA = paddle::platform::CUDADeviceContext;
ops::CastOpKernel<CUDA, plat::complex<float>>, \
ops::CastOpKernel<CUDA, plat::complex<double>>, ##__VA_ARGS__);
-#if !defined(PADDLE_WITH_HIP)
// See [ why register transfer_dtype_op alias with cast_op? ] in cast_op.cc
REGISTER_CAST_CUDA_BASE(transfer_dtype, ops::CastOpKernel<CUDA, plat::bfloat16>)
-#else
-REGISTER_CAST_CUDA_BASE(transfer_dtype)
-#endif
@@ -18,7 +18,9 @@ limitations under the License. */
#include <memory>
#include <string>
#include <vector>
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/phi/infermeta/multiary.h"
#include "paddle/phi/kernels/funcs/concat_funcs.h"
#ifdef PADDLE_WITH_MKLDNN
@@ -33,41 +35,6 @@ class ConcatOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
-void InferShape(framework::InferShapeContext *ctx) const override {
-OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", "Concat");
-OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Concat");
-auto inputs_dims = ctx->GetInputsDim("X");
-const size_t inputs_num = inputs_dims.size();
-PADDLE_ENFORCE_GT(
-inputs_num, static_cast<size_t>(0),
-platform::errors::InvalidArgument(
-"The number of input tensors in concat op should > 0. But "
-"received inputs' length is 0."));
-if (inputs_num == 1) {
-VLOG(3) << "Warning: concat op have only one input, may waste memory";
-}
-if (ctx->HasInput("AxisTensor")) {
-auto out_dims =
-phi::make_ddim(std::vector<int>(inputs_dims[0].size(), -1));
-ctx->SetOutputDim("Out", out_dims);
-ctx->ShareLoD("X", /*->*/ "Out");
-} else {
-size_t axis =
-ComputeAxis(static_cast<int64_t>(ctx->Attrs().Get<int>("axis")),
-static_cast<int64_t>(inputs_dims[0].size()));
-framework::DDim out_dims =
-phi::funcs::ComputeAndCheckShape(ctx->IsRuntime(), inputs_dims, axis);
-if (out_dims[axis] < 0) {
-out_dims[axis] = -1;
-}
-ctx->SetOutputDim("Out", out_dims);
-ctx->ShareLoD("X", /*->*/ "Out");
-}
-}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
@@ -237,9 +204,14 @@ class ConcatDoubleGradOpMaker : public framework::SingleGradOpMaker<T> {
} // namespace paddle
namespace ops = paddle::operators;
+DELCARE_INFER_SHAPE_FUNCTOR(concat, ConcatInferShapeFunctor,
+PT_INFER_META(phi::ConcatInferMeta));
REGISTER_OPERATOR(concat, ops::ConcatOp, ops::ConcatOpMaker,
ops::ConcatGradOpMaker<paddle::framework::OpDesc>,
-ops::ConcatGradOpMaker<paddle::imperative::OpBase>);
+ops::ConcatGradOpMaker<paddle::imperative::OpBase>,
+ConcatInferShapeFunctor);
REGISTER_OPERATOR(concat_grad, ops::ConcatOpGrad,
ops::ConcatDoubleGradOpMaker<paddle::framework::OpDesc>,
ops::ConcatDoubleGradOpMaker<paddle::imperative::OpBase>,
......
@@ -20,5 +20,5 @@ else()
endif()
file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(equal_all);\nUSE_NO_KERNEL_OP(read_from_array);\n")
-file(APPEND ${pybind_file} "USE_OP(logical_and);\nUSE_OP(logical_or);\nUSE_OP(logical_xor);\nUSE_OP(logical_not);\n")
+file(APPEND ${pybind_file} "USE_OP_ITSELF(logical_and);\nUSE_OP_ITSELF(logical_or);\nUSE_OP_ITSELF(logical_xor);\nUSE_OP_ITSELF(logical_not);\n")
file(APPEND ${pybind_file} "USE_OP(bitwise_and);\nUSE_OP(bitwise_or);\nUSE_OP(bitwise_xor);\nUSE_OP(bitwise_not);\n")
@@ -9,11 +9,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-#include "paddle/fluid/operators/controlflow/logical_op.h"
#include <algorithm>
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
namespace paddle {
namespace operators {
@@ -145,15 +145,7 @@ class BinaryLogicalOp : public LogicalOp {
::paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_BINARY_LOGICAL_OP(logical_and, "$$Out = X \\&\\& Y$$");
-REGISTER_BINARY_LOGICAL_KERNEL(logical_and, CPU,
-paddle::operators::LogicalAndFunctor);
REGISTER_BINARY_LOGICAL_OP(logical_or, "$$Out = X || Y$$");
-REGISTER_BINARY_LOGICAL_KERNEL(logical_or, CPU,
-paddle::operators::LogicalOrFunctor);
REGISTER_UNARY_LOGICAL_OP(logical_not, "$$Out = !X$$");
-REGISTER_UNARY_LOGICAL_KERNEL(logical_not, CPU,
-paddle::operators::LogicalNotFunctor);
REGISTER_BINARY_LOGICAL_OP(logical_xor,
"$$Out = (X || Y) \\&\\& !(X \\&\\& Y)$$");
-REGISTER_BINARY_LOGICAL_KERNEL(logical_xor, CPU,
-paddle::operators::LogicalXorFunctor);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/controlflow/logical_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h"
namespace paddle {
namespace operators {
template <typename Functor>
class BinaryLogicalOpKernel<platform::CUDADeviceContext, Functor>
: public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
using InT = typename Functor::ELEMENT_TYPE;
using OutT = bool;
auto functor = Functor();
std::vector<const framework::Tensor*> ins;
std::vector<framework::Tensor*> outs;
const auto& cuda_ctx =
ctx.template device_context<platform::CUDADeviceContext>();
int axis = PackTensorsIntoVector<OutT>(ctx, &ins, &outs);
if (ins.size() == 1) {
paddle::operators::LaunchElementwiseCudaKernel<ElementwiseType::kUnary,
InT, OutT>(
cuda_ctx, ins, &outs, axis, functor);
} else {
paddle::operators::LaunchElementwiseCudaKernel<ElementwiseType::kBinary,
InT, OutT>(
cuda_ctx, ins, &outs, axis, functor);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
#define REGISTER_LOGICAL_CUDA_KERNEL(op_name, func) \
REGISTER_OP_CUDA_KERNEL( \
op_name, \
ops::BinaryLogicalOpKernel<plat::CUDADeviceContext, ops::func<bool>>, \
ops::BinaryLogicalOpKernel<plat::CUDADeviceContext, ops::func<int8_t>>, \
ops::BinaryLogicalOpKernel<plat::CUDADeviceContext, ops::func<int16_t>>, \
ops::BinaryLogicalOpKernel<plat::CUDADeviceContext, ops::func<int>>, \
ops::BinaryLogicalOpKernel<plat::CUDADeviceContext, ops::func<int64_t>>, \
ops::BinaryLogicalOpKernel<plat::CUDADeviceContext, ops::func<float>>, \
ops::BinaryLogicalOpKernel<plat::CUDADeviceContext, ops::func<double>>);
REGISTER_LOGICAL_CUDA_KERNEL(logical_or, LogicalOrFunctor)
REGISTER_LOGICAL_CUDA_KERNEL(logical_and, LogicalAndFunctor)
REGISTER_LOGICAL_CUDA_KERNEL(logical_xor, LogicalXorFunctor)
REGISTER_LOGICAL_CUDA_KERNEL(logical_not, LogicalNotFunctor)
#undef REGISTER_LOGICAL_CUDA_KERNEL
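Reviewer note: the new CUDA file above routes every logical op through the same elementwise launcher and picks the unary or binary path from the number of packed inputs. The standalone host-side sketch below (local stand-in functors, not the Paddle ones) mirrors that split so the intended semantics, cast the operands to bool and store a bool result, are easy to sanity-check.
// Standalone host-side sketch of the functor-driven logical elementwise path.
// These structs only mirror the shape of the kernels above; they are not the
// Paddle functors themselves.
#include <cassert>
#include <cstdint>
#include <vector>
template <typename T>
struct AndRef {
  bool operator()(T a, T b) const {
    return static_cast<bool>(a) && static_cast<bool>(b);
  }
};
template <typename T>
struct NotRef {
  bool operator()(T a) const { return !static_cast<bool>(a); }
};
// Binary path: two inputs of the same length, bool output.
template <typename T, typename Functor>
std::vector<bool> LaunchBinary(const std::vector<T>& x, const std::vector<T>& y,
                               Functor f) {
  std::vector<bool> out(x.size());
  for (size_t i = 0; i < x.size(); ++i) out[i] = f(x[i], y[i]);
  return out;
}
// Unary path: a single input, bool output.
template <typename T, typename Functor>
std::vector<bool> LaunchUnary(const std::vector<T>& x, Functor f) {
  std::vector<bool> out(x.size());
  for (size_t i = 0; i < x.size(); ++i) out[i] = f(x[i]);
  return out;
}
int main() {
  std::vector<int64_t> x = {0, 3, -1, 0};
  std::vector<int64_t> y = {1, 0, 2, 0};
  auto a = LaunchBinary(x, y, AndRef<int64_t>{});  // expected: 0, 0, 1, 0
  auto n = LaunchUnary(x, NotRef<int64_t>{});      // expected: 1, 0, 0, 1
  assert(a[2] && !a[0] && n[0] && !n[1]);
  return 0;
}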
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <math.h>
#include <type_traits>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
#include "paddle/fluid/platform/transform.h"
namespace paddle {
namespace operators {
#define LOGICAL_BINARY_FUNCTOR(func_name, op) \
template <typename T> \
struct func_name { \
using ELEMENT_TYPE = T; \
HOSTDEVICE bool operator()(const T a, const T b) const { \
return static_cast<bool>(a) op static_cast<bool>(b); \
} \
};
LOGICAL_BINARY_FUNCTOR(LogicalOrFunctor, ||)
LOGICAL_BINARY_FUNCTOR(LogicalAndFunctor, &&)
LOGICAL_BINARY_FUNCTOR(LogicalXorFunctor, ^)
#undef LOGICAL_BINARY_FUNCTOR
template <typename T>
struct LogicalNotFunctor {
using ELEMENT_TYPE = T;
HOSTDEVICE bool operator()(const T a) const { return !a; }
};
template <typename DeviceContext, typename Functor>
class BinaryLogicalOpKernel
: public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
public:
void Compute(const framework::ExecutionContext& context) const override {
using T = typename Functor::ELEMENT_TYPE;
auto* x = context.Input<framework::Tensor>("X");
auto* y = context.Input<framework::Tensor>("Y");
auto* out = context.Output<framework::Tensor>("Out");
Functor binary_func;
ElementwiseComputeEx<Functor, DeviceContext, T, bool>(context, x, y, -1,
binary_func, out);
}
};
template <typename DeviceContext, typename Functor>
class UnaryLogicalOpKernel
: public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
public:
void Compute(const framework::ExecutionContext& context) const override {
using T = typename Functor::ELEMENT_TYPE;
auto* x = context.Input<framework::Tensor>("X");
auto* out = context.Output<framework::Tensor>("Out");
Functor unary_func;
platform::Transform<DeviceContext> trans;
trans(context.template device_context<DeviceContext>(), x->data<T>(),
x->data<T>() + x->numel(),
out->mutable_data<bool>(context.GetPlace()), unary_func);
}
};
} // namespace operators
} // namespace paddle
#define REGISTER_BINARY_LOGICAL_KERNEL(op_type, dev, functor) \
REGISTER_OP_##dev##_KERNEL( \
op_type, ::paddle::operators::BinaryLogicalOpKernel< \
::paddle::platform::dev##DeviceContext, functor<bool>>, \
::paddle::operators::BinaryLogicalOpKernel< \
::paddle::platform::dev##DeviceContext, functor<int8_t>>, \
::paddle::operators::BinaryLogicalOpKernel< \
::paddle::platform::dev##DeviceContext, functor<int16_t>>, \
::paddle::operators::BinaryLogicalOpKernel< \
::paddle::platform::dev##DeviceContext, functor<int>>, \
::paddle::operators::BinaryLogicalOpKernel< \
::paddle::platform::dev##DeviceContext, functor<int64_t>>, \
::paddle::operators::BinaryLogicalOpKernel< \
::paddle::platform::dev##DeviceContext, functor<float>>, \
::paddle::operators::BinaryLogicalOpKernel< \
::paddle::platform::dev##DeviceContext, functor<double>>);
#define REGISTER_UNARY_LOGICAL_KERNEL(op_type, dev, functor) \
REGISTER_OP_##dev##_KERNEL( \
op_type, ::paddle::operators::UnaryLogicalOpKernel< \
::paddle::platform::dev##DeviceContext, functor<bool>>, \
::paddle::operators::UnaryLogicalOpKernel< \
::paddle::platform::dev##DeviceContext, functor<int8_t>>, \
::paddle::operators::UnaryLogicalOpKernel< \
::paddle::platform::dev##DeviceContext, functor<int16_t>>, \
::paddle::operators::UnaryLogicalOpKernel< \
::paddle::platform::dev##DeviceContext, functor<int>>, \
::paddle::operators::UnaryLogicalOpKernel< \
::paddle::platform::dev##DeviceContext, functor<int64_t>>, \
::paddle::operators::UnaryLogicalOpKernel< \
::paddle::platform::dev##DeviceContext, functor<float>>, \
::paddle::operators::UnaryLogicalOpKernel< \
::paddle::platform::dev##DeviceContext, functor<double>>);
...@@ -9,7 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -9,7 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/controlflow/logical_op.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle { namespace paddle {
......
...@@ -14,6 +14,10 @@ ...@@ -14,6 +14,10 @@
#include "paddle/fluid/operators/dot_op.h" #include "paddle/fluid/operators/dot_op.h"
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/binary.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -21,51 +25,6 @@ class DotOp : public framework::OperatorWithKernel { ...@@ -21,51 +25,6 @@ class DotOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE_EQ(true, ctx->HasInput("X"),
platform::errors::PreconditionNotMet(
"Input(X) of DotOp should not be null."));
PADDLE_ENFORCE_EQ(true, ctx->HasInput("Y"),
platform::errors::PreconditionNotMet(
"Input(Y) of DotOp should not be null."));
PADDLE_ENFORCE_EQ(true, ctx->HasOutput("Out"),
platform::errors::PreconditionNotMet(
"Output(Out) of DotOp should not be null."));
auto x_dims = ctx->GetInputDim("X");
auto x_rank = static_cast<size_t>(x_dims.size());
PADDLE_ENFORCE_EQ(true, 1 == x_rank || 2 == x_rank,
platform::errors::PreconditionNotMet(
"ShapeError: The dimensions of input tensor X (%s) "
"should be 1 or 2",
x_dims.to_str()));
auto y_dims = ctx->GetInputDim("Y");
PADDLE_ENFORCE_EQ(
true, x_rank == (size_t)y_dims.size(),
platform::errors::PreconditionNotMet(
"ShapeError: The shape of input tensor Y: %s should match with "
"input tenosr X: %s",
y_dims.to_str(), x_dims.to_str()));
bool shape_match = true;
for (size_t i = 0; i < x_rank; ++i) {
if (x_dims[i] != y_dims[i]) {
shape_match = false;
break;
}
}
PADDLE_ENFORCE_EQ(true, shape_match,
platform::errors::PreconditionNotMet(
"ShapeError: The shape of input tensor X: %s should "
"be exactly the same "
"with input tensor Y: %s",
x_dims.to_str(), y_dims.to_str()));
auto dims = vectorize(x_dims);
dims[dims.size() - 1] = 1;
ctx->SetOutputDim("Out", phi::make_ddim(dims));
}
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
...@@ -142,9 +101,13 @@ class DotOpGradMaker : public framework::SingleGradOpMaker<T> { ...@@ -142,9 +101,13 @@ class DotOpGradMaker : public framework::SingleGradOpMaker<T> {
namespace ops = paddle::operators; namespace ops = paddle::operators;
DELCARE_INFER_SHAPE_FUNCTOR(dot, DotInferShapeFunctor,
PT_INFER_META(phi::DotInferMeta));
REGISTER_OPERATOR(dot, ops::DotOp, ops::DotOpMaker, REGISTER_OPERATOR(dot, ops::DotOp, ops::DotOpMaker,
ops::DotOpGradMaker<paddle::framework::OpDesc>, ops::DotOpGradMaker<paddle::framework::OpDesc>,
ops::DotOpGradMaker<paddle::imperative::OpBase>); ops::DotOpGradMaker<paddle::imperative::OpBase>,
DotInferShapeFunctor);
REGISTER_OPERATOR(dot_grad, ops::DotGradOp); REGISTER_OPERATOR(dot_grad, ops::DotGradOp);
......
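Reviewer note: with the hand-written InferShape deleted here, the shape contract moves to phi::DotInferMeta. Below is a minimal sketch of the rule the removed code enforced and that the phi meta function is expected to keep: X and Y share a 1-D or 2-D shape, and Out is X's shape with the last dimension set to 1. Names are local, not Paddle APIs.
#include <cassert>
#include <cstdint>
#include <stdexcept>
#include <vector>
// Sketch of the dot output-shape rule; throws where the op would raise an
// enforce error.
std::vector<int64_t> DotOutShape(const std::vector<int64_t>& x,
                                 const std::vector<int64_t>& y) {
  if (x.size() != 1 && x.size() != 2) throw std::invalid_argument("rank must be 1 or 2");
  if (x != y) throw std::invalid_argument("X/Y shape mismatch");
  std::vector<int64_t> out = x;
  out.back() = 1;  // dot reduces the last axis
  return out;
}
int main() {
  assert((DotOutShape({5}, {5}) == std::vector<int64_t>{1}));
  assert((DotOutShape({4, 8}, {4, 8}) == std::vector<int64_t>{4, 1}));
  return 0;
}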
...@@ -201,12 +201,14 @@ REGISTER_OPERATOR(gather_grad, ops::GatherGradOp, ...@@ -201,12 +201,14 @@ REGISTER_OPERATOR(gather_grad, ops::GatherGradOp,
REGISTER_OP_CPU_KERNEL(gather, ops::GatherOpKernel<float>, REGISTER_OP_CPU_KERNEL(gather, ops::GatherOpKernel<float>,
ops::GatherOpKernel<double>, ops::GatherOpKernel<int>, ops::GatherOpKernel<double>, ops::GatherOpKernel<int>,
ops::GatherOpKernel<uint8_t>, ops::GatherOpKernel<uint8_t>,
ops::GatherOpKernel<int64_t>); ops::GatherOpKernel<int64_t>,
ops::GatherOpKernel<phi::dtype::bfloat16>);
REGISTER_OP_CPU_KERNEL(gather_grad, ops::GatherGradientOpKernel<float>, REGISTER_OP_CPU_KERNEL(gather_grad, ops::GatherGradientOpKernel<float>,
ops::GatherGradientOpKernel<double>, ops::GatherGradientOpKernel<double>,
ops::GatherGradientOpKernel<int>, ops::GatherGradientOpKernel<int>,
ops::GatherGradientOpKernel<uint8_t>, ops::GatherGradientOpKernel<uint8_t>,
ops::GatherGradientOpKernel<int64_t>); ops::GatherGradientOpKernel<int64_t>,
ops::GatherGradientOpKernel<phi::dtype::bfloat16>);
REGISTER_OP_VERSION(gather) REGISTER_OP_VERSION(gather)
.AddCheckpoint(R"ROC(upgrad gather, add a new input [Axis])ROC", .AddCheckpoint(R"ROC(upgrad gather, add a new input [Axis])ROC",
paddle::framework::compatible::OpVersionDesc().NewInput( paddle::framework::compatible::OpVersionDesc().NewInput(
......
...@@ -130,9 +130,11 @@ REGISTER_OP_CUDA_KERNEL(gather, ops::GatherOpCUDAKernel<float>, ...@@ -130,9 +130,11 @@ REGISTER_OP_CUDA_KERNEL(gather, ops::GatherOpCUDAKernel<float>,
ops::GatherOpCUDAKernel<double>, ops::GatherOpCUDAKernel<double>,
ops::GatherOpCUDAKernel<int64_t>, ops::GatherOpCUDAKernel<int64_t>,
ops::GatherOpCUDAKernel<int>, ops::GatherOpCUDAKernel<int>,
ops::GatherOpCUDAKernel<plat::float16>); ops::GatherOpCUDAKernel<plat::float16>,
ops::GatherOpCUDAKernel<plat::bfloat16>);
REGISTER_OP_CUDA_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel<float>, REGISTER_OP_CUDA_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel<float>,
ops::GatherGradOpCUDAKernel<double>, ops::GatherGradOpCUDAKernel<double>,
ops::GatherGradOpCUDAKernel<int64_t>, ops::GatherGradOpCUDAKernel<int64_t>,
ops::GatherGradOpCUDAKernel<int>, ops::GatherGradOpCUDAKernel<int>,
ops::GatherGradOpCUDAKernel<plat::float16>); ops::GatherGradOpCUDAKernel<plat::float16>,
ops::GatherGradOpCUDAKernel<plat::bfloat16>);
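Reviewer note: the gather change only instantiates the existing kernels for bfloat16, so the semantics stay dtype-agnostic. The following small host reference of the axis-0 gather those kernels implement is a sketch with local names and plain float standing in for bfloat16.
#include <cassert>
#include <cstdint>
#include <vector>
// Dtype-agnostic axis-0 gather reference: out[i, :] = x[index[i], :].
template <typename T, typename IndexT>
std::vector<T> GatherRows(const std::vector<T>& x, int64_t rows, int64_t cols,
                          const std::vector<IndexT>& index) {
  std::vector<T> out(index.size() * cols);
  for (size_t i = 0; i < index.size(); ++i) {
    assert(index[i] >= 0 && index[i] < rows);  // out-of-range would be an enforce error
    for (int64_t j = 0; j < cols; ++j) {
      out[i * cols + j] = x[index[i] * cols + j];
    }
  }
  return out;
}
int main() {
  // 3x2 input, gather rows {2, 0}.
  std::vector<float> x = {1, 2, 3, 4, 5, 6};
  auto out = GatherRows<float, int>(x, 3, 2, {2, 0});
  assert(out == (std::vector<float>{5, 6, 1, 2}));
  return 0;
}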
...@@ -29,6 +29,7 @@ namespace operators { ...@@ -29,6 +29,7 @@ namespace operators {
using DataLayout = framework::DataLayout; using DataLayout = framework::DataLayout;
enum GroupNormKernelFlags { kHasScale = 1, kHasBias = 2 }; enum GroupNormKernelFlags { kHasScale = 1, kHasBias = 2 };
#define ALIGN_BYTES 16
#define CHECK_CASE(i, flags, kernel_name, ...) \ #define CHECK_CASE(i, flags, kernel_name, ...) \
if (i == flags) { \ if (i == flags) { \
...@@ -56,8 +57,7 @@ __device__ __inline__ void CudaAtomicAddWithWarp(T* sum, T value) { ...@@ -56,8 +57,7 @@ __device__ __inline__ void CudaAtomicAddWithWarp(T* sum, T value) {
template <typename T> template <typename T>
__global__ void GroupNormForwardGetMeanAndVar(const T* x, int N, int C, int W, __global__ void GroupNormForwardGetMeanAndVar(const T* x, int N, int C, int W,
int imsize, int groups, int imsize, int groups,
int group_size, T* mean, T* var, int group_size, T* mean, T* var) {
const DataLayout data_layout) {
int gid = blockIdx.y; int gid = blockIdx.y;
int cid = blockIdx.x; int cid = blockIdx.x;
int bid = blockIdx.z; int bid = blockIdx.z;
...@@ -68,13 +68,10 @@ __global__ void GroupNormForwardGetMeanAndVar(const T* x, int N, int C, int W, ...@@ -68,13 +68,10 @@ __global__ void GroupNormForwardGetMeanAndVar(const T* x, int N, int C, int W,
T x_mean = 0, x_var = 0; T x_mean = 0, x_var = 0;
for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) { for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) {
T val; T val;
if (data_layout == DataLayout::kNCHW) { int hid = imid / W;
val = x[(bid * C + ccid) * imsize + imid]; int wid = imid % W;
} else { val = x[(bid * H + hid) * W * C + wid * C + ccid];
int hid = imid / W;
int wid = imid % W;
val = x[(bid * H + hid) * W * C + wid * C + ccid];
}
x_mean += val; x_mean += val;
x_var += val * val; x_var += val * val;
} }
...@@ -84,6 +81,85 @@ __global__ void GroupNormForwardGetMeanAndVar(const T* x, int N, int C, int W, ...@@ -84,6 +81,85 @@ __global__ void GroupNormForwardGetMeanAndVar(const T* x, int N, int C, int W,
CudaAtomicAddWithWarp(&var[bid * groups + gid], x_var); CudaAtomicAddWithWarp(&var[bid * groups + gid], x_var);
} }
template <typename T, typename AccT, int VecSize>
__device__ __forceinline__ void ThreadReduce(const T* input, int size,
const int offset, AccT* mean,
AccT* var) {
using VecT = kps::details::VectorType<T, VecSize>;
int tid = threadIdx.x;
if (offset > 0) {
input -= offset;
size += offset;
if (tid >= offset) {
AccT temp = input[tid];
*mean += temp;
*var += temp * temp;
}
size -= blockDim.x;
input += blockDim.x;
}
int remain = size % (VecSize * blockDim.x);
T ins[VecSize];
VecT* ins_vec = reinterpret_cast<VecT*>(&ins);
// vector part
for (; VecSize * tid < (size - remain); tid += blockDim.x) {
*ins_vec = reinterpret_cast<const VecT*>(input)[tid];
#pragma unroll
for (int i = 0; i < VecSize; ++i) {
AccT temp = ins[i];
*mean += temp;
*var += temp * temp;
}
}
// scalar part
tid = size - remain + threadIdx.x;
for (; tid < size; tid += blockDim.x) {
AccT temp = input[tid];
*mean += temp;
*var += temp * temp;
}
}
template <typename T>
__global__ void ScalarGetMeanAndVarNCHW(const T* x, T* mean, T* var, int size) {
int i = blockIdx.x;
T x_mean = 0, x_var = 0;
for (int j = threadIdx.x; j < size; j += blockDim.x) {
T val;
val = x[i * size + j];
x_mean += val;
x_var += val * val;
}
x_mean /= size;
x_var /= size;
CudaAtomicAddWithWarp(&mean[i], x_mean);
CudaAtomicAddWithWarp(&var[i], x_var);
}
template <typename T, typename AccT, int VecSize>
__global__ void VectorizedGetMeanAndVarNCHW(const T* x, T* mean, T* var,
int size) {
int i = blockIdx.x;
AccT x_mean = static_cast<AccT>(0);
AccT x_var = static_cast<AccT>(0);
const int input_offset = ((uint64_t)x) % ALIGN_BYTES / sizeof(T);
x += i * size;
ThreadReduce<T, AccT, VecSize>(x, size, input_offset, &x_mean, &x_var);
x_mean = kps::details::BlockXReduce<AccT, kps::AddFunctor<AccT>>(
x_mean, kps::AddFunctor<AccT>());
x_var = kps::details::BlockXReduce<AccT, kps::AddFunctor<AccT>>(
x_var, kps::AddFunctor<AccT>());
__syncthreads();
if (threadIdx.x == 0) {
mean[i] = static_cast<T>(x_mean / size);
var[i] = static_cast<T>(x_var / size);
}
}
template <typename T, int flags> template <typename T, int flags>
__global__ void GroupNormForward(const T* x, const T* mean, const T* var, __global__ void GroupNormForward(const T* x, const T* mean, const T* var,
const T* scale, const T* bias, int N, int C, const T* scale, const T* bias, int N, int C,
...@@ -96,26 +172,34 @@ __global__ void GroupNormForward(const T* x, const T* mean, const T* var, ...@@ -96,26 +172,34 @@ __global__ void GroupNormForward(const T* x, const T* mean, const T* var,
int H = imsize / W; int H = imsize / W;
int ccid = gid * group_size + cid; int ccid = gid * group_size + cid;
if (ccid >= C) return; if (ccid >= C) return;
T x_mean = mean[bid * groups + gid]; auto ng = bid * groups + gid;
T x_var = var[bid * groups + gid]; T x_mean = mean[ng];
T x_var = var[ng];
x_var = x_var - x_mean * x_mean; x_var = x_var - x_mean * x_mean;
T var_inv = 1.0 / sqrt(x_var + epsilon); T var_inv = rsqrt(x_var + epsilon);
if (cid == 0 && threadIdx.x == 0) real_var[bid * groups + gid] = x_var; if (cid == 0 && threadIdx.x == 0) {
real_var[ng] = x_var;
}
for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) { for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) {
T val; T val;
int hid, wid; int hid, wid;
int index = (bid * C + ccid) * imsize + imid;
if (data_layout == DataLayout::kNCHW) { if (data_layout == DataLayout::kNCHW) {
val = x[(bid * C + ccid) * imsize + imid]; val = x[index];
} else { } else {
hid = imid / W; hid = imid / W;
wid = imid % W; wid = imid % W;
val = x[(bid * H + hid) * W * C + wid * C + ccid]; val = x[(bid * H + hid) * W * C + wid * C + ccid];
} }
val = (val - x_mean) * var_inv; val = (val - x_mean) * var_inv;
if (flags & kHasScale) val *= scale[gid * group_size + cid]; if (flags & kHasScale) {
if (flags & kHasBias) val += bias[gid * group_size + cid]; val *= scale[ccid];
}
if (flags & kHasBias) {
val += bias[ccid];
}
if (data_layout == DataLayout::kNCHW) { if (data_layout == DataLayout::kNCHW) {
y[(bid * C + ccid) * imsize + imid] = val; y[index] = val;
} else { } else {
y[(bid * H + hid) * W * C + wid * C + ccid] = val; y[(bid * H + hid) * W * C + wid * C + ccid] = val;
} }
...@@ -182,16 +266,41 @@ class GroupNormKernel<platform::CUDADeviceContext, T> ...@@ -182,16 +266,41 @@ class GroupNormKernel<platform::CUDADeviceContext, T>
imsize *= x_dims[i]; imsize *= x_dims[i];
} }
} }
#ifdef __HIPCC__ #ifdef __HIPCC__
int block_size = std::max(std::min(256, imsize), 64); int block_size = std::max(std::min(256, imsize), 64);
#else #else
int block_size = std::min(1024, imsize); int block_size = std::min(1024, imsize);
#endif #endif
dim3 grid(group_size, groups, x_dims[0]); dim3 grid(group_size, groups, x_dims[0]);
dim3 threads(block_size, 1, 1); dim3 threads(block_size, 1, 1);
GroupNormForwardGetMeanAndVar<T><<<grid, threads, 0, dev_ctx.stream()>>>( if (data_layout == DataLayout::kNCHW) {
x_data, x_dims[0], C, W, imsize, groups, group_size, mean_data, using AccT = typename details::MPTypeTrait<T>::Type;
temp_var_data, data_layout); constexpr int vec_size = sizeof(float4) / sizeof(T);
int size = group_size * imsize;
const int max_num_threads = 1024;
int max_block_size = std::min(size / vec_size, max_num_threads);
int block_size_nchw = 1;
while (block_size_nchw < max_block_size) {
block_size_nchw *= 2;
}
block_size_nchw = std::max(block_size_nchw, kps::details::kWarpSize);
dim3 grids(x_dims[0] * groups);
dim3 blocks(block_size_nchw);
if (size < vec_size) {
ScalarGetMeanAndVarNCHW<T><<<grids, blocks, 0, dev_ctx.stream()>>>(
x_data, mean_data, temp_var_data, size);
} else {
VectorizedGetMeanAndVarNCHW<
T, AccT, vec_size><<<grids, blocks, 0, dev_ctx.stream()>>>(
x_data, mean_data, temp_var_data, size);
}
} else {
GroupNormForwardGetMeanAndVar<T><<<grid, threads, 0, dev_ctx.stream()>>>(
x_data, x_dims[0], C, W, imsize, groups, group_size, mean_data,
temp_var_data);
}
int flags = int flags =
(scale_data != nullptr) * kHasScale + (bias_data != nullptr) * kHasBias; (scale_data != nullptr) * kHasScale + (bias_data != nullptr) * kHasBias;
UNROLL_ALL_CASES(flags, GroupNormForward, x_data, mean_data, temp_var_data, UNROLL_ALL_CASES(flags, GroupNormForward, x_data, mean_data, temp_var_data,
......
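Reviewer note: as read from the kernels above, the new NCHW statistics path accumulates a per-(batch, group) sum and sum of squares, stores mean and E[x^2], and leaves the subtraction of mean^2 to GroupNormForward; the launch rounds the block size up to a power of two, caps it at 1024 and floors it at the warp size. The host-side sketch below restates both pieces under that reading; all names are local.
#include <algorithm>
#include <cstdio>
#include <vector>
// One (mean, E[x^2]) pair per contiguous chunk of group_size * imsize values,
// mirroring what the scalar/vectorized NCHW kernels write out.
void GroupStatsNCHW(const std::vector<float>& x, int chunks, int group_size,
                    int imsize, std::vector<float>* mean,
                    std::vector<float>* mean_sq) {
  int size = group_size * imsize;
  mean->assign(chunks, 0.f);
  mean_sq->assign(chunks, 0.f);
  for (int g = 0; g < chunks; ++g) {
    double s = 0.0, ss = 0.0;
    for (int j = 0; j < size; ++j) {
      double v = x[g * size + j];
      s += v;
      ss += v * v;
    }
    (*mean)[g] = static_cast<float>(s / size);
    (*mean_sq)[g] = static_cast<float>(ss / size);  // variance = E[x^2] - mean^2, applied later
  }
}
// Block-size heuristic: power of two >= size / vec_size, capped at 1024,
// floored at the warp size.
int BlockSizeNCHW(int size, int vec_size, int warp_size = 32) {
  int max_block_size = std::min(size / vec_size, 1024);
  int block = 1;
  while (block < max_block_size) block *= 2;
  return std::max(block, warp_size);
}
int main() {
  std::vector<float> x = {1, 2, 3, 4, 5, 6, 7, 8};  // two chunks of four values
  std::vector<float> mean, mean_sq;
  GroupStatsNCHW(x, /*chunks=*/2, /*group_size=*/1, /*imsize=*/4, &mean, &mean_sq);
  std::printf("chunk0 mean=%f var=%f\n", mean[0], mean_sq[0] - mean[0] * mean[0]);
  std::printf("block=%d\n", BlockSizeNCHW(/*size=*/4096, /*vec_size=*/4));
  return 0;
}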
...@@ -12,12 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,12 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/index_sample_op.h"
#include <vector> #include <vector>
#include "paddle/fluid/framework/no_need_buffer_vars_inference.h" #include "paddle/fluid/framework/no_need_buffer_vars_inference.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/binary.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
class IndexSampleOpMaker : public framework::OpProtoAndCheckerMaker { class IndexSampleOpMaker : public framework::OpProtoAndCheckerMaker {
...@@ -42,44 +44,6 @@ class IndexSampleOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -42,44 +44,6 @@ class IndexSampleOpMaker : public framework::OpProtoAndCheckerMaker {
class IndexSampleOp : public framework::OperatorWithKernel { class IndexSampleOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
platform::errors::InvalidArgument(
"Inputs(Input) of FindByIndex should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasInput("Index"), true,
platform::errors::InvalidArgument(
"Inputs(Index) of FindByIndex should not be null."));
auto input_dims = ctx->GetInputDim("X");
PADDLE_ENFORCE_EQ(
input_dims.size(), 2,
platform::errors::InvalidArgument(
"Inputs(X) shape of IndexSample op should be 2-D, but "
"got X's shape = [%s], please check X shape.",
input_dims));
auto index_dims = ctx->GetInputDim("Index");
PADDLE_ENFORCE_EQ(
input_dims.size(), 2,
platform::errors::InvalidArgument(
"Inputs(Index) shape of IndexSample op should be 2-D, but "
"got Index's shape [%s] , please check index shape.",
input_dims));
if (ctx->IsRuntime()) {
PADDLE_ENFORCE_EQ(input_dims[0], index_dims[0],
platform::errors::InvalidArgument(
"Inputs(X)'s value of dimension 0 must same with "
"Inputs(Index)'s value of dimension 0, but "
"got %d of Inputs(X), and got %d of Inputs(Index), "
"please check Inputs shape.",
input_dims[0], index_dims[0]));
}
ctx->SetOutputDim("Out", index_dims);
auto type = ctx->GetInputsVarType("Index")[0];
if (type == framework::proto::VarType::LOD_TENSOR) {
ctx->ShareLoD("Index", /*->*/ "Out");
}
}
protected: protected:
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
...@@ -136,20 +100,11 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(IndexSampleGradNoNeedBufferVarInferer, "X"); ...@@ -136,20 +100,11 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(IndexSampleGradNoNeedBufferVarInferer, "X");
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
DELCARE_INFER_SHAPE_FUNCTOR(index_sample, IndexSampleInferShapeFunctor,
PT_INFER_META(phi::IndexSampleInferMeta));
REGISTER_OPERATOR(index_sample, ops::IndexSampleOp, ops::IndexSampleOpMaker, REGISTER_OPERATOR(index_sample, ops::IndexSampleOp, ops::IndexSampleOpMaker,
ops::IndexSampleGradMaker<paddle::framework::OpDesc>, ops::IndexSampleGradMaker<paddle::framework::OpDesc>,
ops::IndexSampleGradMaker<paddle::imperative::OpBase>); ops::IndexSampleGradMaker<paddle::imperative::OpBase>,
IndexSampleInferShapeFunctor);
REGISTER_OPERATOR(index_sample_grad, ops::IndexSampleGradOp, REGISTER_OPERATOR(index_sample_grad, ops::IndexSampleGradOp,
ops::IndexSampleGradNoNeedBufferVarInferer); ops::IndexSampleGradNoNeedBufferVarInferer);
REGISTER_OP_CPU_KERNEL(
index_sample,
ops::IndexSampleKernel<paddle::platform::CPUDeviceContext, float>,
ops::IndexSampleKernel<paddle::platform::CPUDeviceContext, double>,
ops::IndexSampleKernel<paddle::platform::CPUDeviceContext, int>,
ops::IndexSampleKernel<paddle::platform::CPUDeviceContext, int64_t>);
REGISTER_OP_CPU_KERNEL(
index_sample_grad,
ops::IndexSampleGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::IndexSampleGradKernel<paddle::platform::CPUDeviceContext, double>,
ops::IndexSampleGradKernel<paddle::platform::CPUDeviceContext, int>,
ops::IndexSampleGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
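Reviewer note: with the CPU kernels and InferShape removed, the shape contract is expected to live in phi::IndexSampleInferMeta. A minimal sketch of that rule (both inputs 2-D, matching batch dimension, Out takes Index's shape); local names, not Paddle APIs.
#include <cassert>
#include <cstdint>
#include <stdexcept>
#include <vector>
// Sketch of the index_sample output-shape rule the removed InferShape enforced.
std::vector<int64_t> IndexSampleOutShape(const std::vector<int64_t>& x,
                                         const std::vector<int64_t>& index) {
  if (x.size() != 2 || index.size() != 2)
    throw std::invalid_argument("X and Index must be 2-D");
  if (x[0] != index[0])
    throw std::invalid_argument("batch dims of X and Index must match");
  return index;  // Out: [batch, index_length]
}
int main() {
  assert((IndexSampleOutShape({4, 16}, {4, 3}) == std::vector<int64_t>{4, 3}));
  return 0;
}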
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/index_sample_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#define PREDEFINED_BLOCK_SIZE_X 512
#define PREDEFINED_BLOCK_SIZE 1024
#define MIN(a, b) ((a) < (b) ? (a) : (b))
namespace paddle {
namespace operators {
namespace {
void LimitGridDim(const framework::ExecutionContext& ctx, dim3* grid_dim) {
auto max_grid_dim = ctx.template device_context<platform::CUDADeviceContext>()
.GetCUDAMaxGridDimSize();
grid_dim->x = grid_dim->x < max_grid_dim[0] ? grid_dim->x : max_grid_dim[0];
grid_dim->y = grid_dim->y < max_grid_dim[1] ? grid_dim->y : max_grid_dim[1];
}
}  // namespace
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
template <typename T, typename IndexT = int>
__global__ void IndexSampleForward(const IndexT* index, const T* in_data,
T* out_data, size_t index_length,
size_t input_length, size_t batch_size) {
unsigned int index_i = blockDim.x * blockIdx.x + threadIdx.x;
unsigned int index_j = blockDim.y * blockIdx.y + threadIdx.y;
for (; index_j < batch_size; index_j += blockDim.y * gridDim.y) {
index_i = blockDim.x * blockIdx.x + threadIdx.x;
for (; index_i < index_length; index_i += blockDim.x * gridDim.x) {
unsigned int index_idx = index_j * index_length + index_i;
unsigned int in_idx = index_j * input_length + index_i;
IndexT sample_idx = index[index_idx];
out_data[index_idx] = in_data[in_idx - index_i + sample_idx];
}
}
}
template <typename T, typename IndexT = int>
__global__ void IndexSampleGrad(const IndexT* index, T* in_grad,
const T* out_grad, size_t index_length,
size_t input_length, size_t batch_size,
bool same_data_in_row = true) {
unsigned int index_i = blockDim.x * blockIdx.x + threadIdx.x;
unsigned int index_j = blockDim.y * blockIdx.y + threadIdx.y;
for (; index_j < batch_size; index_j += blockDim.y * gridDim.y) {
index_i = blockDim.x * blockIdx.x + threadIdx.x;
for (; index_i < index_length; index_i += blockDim.x * gridDim.x) {
unsigned int index_idx = index_j * index_length + index_i;
unsigned int in_idx = index_j * input_length + index_i;
IndexT sample_idx = index[index_idx];
if (same_data_in_row) {
platform::CudaAtomicAdd(&(in_grad[in_idx - index_i + sample_idx]),
out_grad[sample_idx]);
} else {
in_grad[in_idx - index_i + sample_idx] = out_grad[index_idx];
}
}
}
}
template <typename T>
class IndexSampleKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<LoDTensor>("X");
auto* index = ctx.Input<LoDTensor>("Index");
auto* output = ctx.Output<LoDTensor>("Out");
const auto& index_type = framework::TransToProtoVarType(index->dtype());
bool index_type_match = index_type == framework::proto::VarType::INT64 ||
index_type == framework::proto::VarType::INT32;
PADDLE_ENFORCE_EQ(index_type_match, true,
platform::errors::InvalidArgument(
"Input(Index) holds the wrong type, it holds %s, but "
"desires to be %s or %s",
paddle::framework::DataTypeToString(index_type),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT32),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT64)));
const auto* in_data = input->data<T>();
auto* out_data = output->mutable_data<T>(ctx.GetPlace());
auto stream =
ctx.template device_context<platform::CUDADeviceContext>().stream();
auto input_dim = input->dims();
auto index_dim = index->dims();
size_t batch_size = input_dim[0];
size_t input_length = input_dim[1];
size_t index_length = index_dim[1];
auto block_width = platform::RoundToPowerOfTwo(index_length);
block_width = MIN(block_width, PREDEFINED_BLOCK_SIZE_X);
int block_height =
platform::RoundToPowerOfTwo(index_length * batch_size) / block_width;
block_height = MIN(block_height, PREDEFINED_BLOCK_SIZE / block_width);
dim3 block_dim(block_width, block_height);
dim3 grid_dim((index_length + block_dim.x - 1) / block_dim.x,
(batch_size + block_dim.y - 1) / block_dim.y);
LimitGridDim(ctx, &grid_dim);
if (index_type == framework::proto::VarType::INT64) {
const int64_t* index_data = index->data<int64_t>();
IndexSampleForward<T, int64_t><<<grid_dim, block_dim, 0, stream>>>(
index_data, in_data, out_data, index_length, input_length,
batch_size);
} else if (index_type == framework::proto::VarType::INT32) {
const int* index_data = index->data<int>();
IndexSampleForward<T, int><<<grid_dim, block_dim, 0, stream>>>(
index_data, in_data, out_data, index_length, input_length,
batch_size);
}
}
};
template <typename T>
class IndexSampleGradKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* output_grad = ctx.Input<LoDTensor>(framework::GradVarName("Out"));
auto* input_grad = ctx.Output<LoDTensor>(framework::GradVarName("X"));
auto* index = ctx.Input<LoDTensor>("Index");
const auto* output_grad_data = output_grad->data<T>();
auto* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
const auto& index_type = framework::TransToProtoVarType(index->dtype());
bool index_type_match = index_type == framework::proto::VarType::INT64 ||
index_type == framework::proto::VarType::INT32;
PADDLE_ENFORCE_EQ(index_type_match, true,
platform::errors::InvalidArgument(
"Input(Index) holds the wrong type, it holds %s, but "
"desires to be %s or %s",
paddle::framework::DataTypeToString(index_type),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT32),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT64)));
auto stream =
ctx.template device_context<platform::CUDADeviceContext>().stream();
auto input_num = input_grad->numel();
auto input_dim = input_grad->dims();
auto index_dim = index->dims();
size_t batch_size = index_dim[0];
size_t input_length = input_dim[1];
size_t index_length = index_dim[1];
bool same_data_in_index_row = index_length == 1 ? false : true;
auto block_width = platform::RoundToPowerOfTwo(index_length);
block_width = MIN(block_width, PREDEFINED_BLOCK_SIZE_X);
auto block_height =
platform::RoundToPowerOfTwo(index_length * batch_size) / block_width;
block_height = MIN(block_height, PREDEFINED_BLOCK_SIZE / block_width);
dim3 block_dim(block_width, block_height);
dim3 grid_dim((index_length + block_dim.x - 1) / block_dim.x,
(batch_size + block_dim.y - 1) / block_dim.y);
LimitGridDim(ctx, &grid_dim);
phi::funcs::SetConstant<platform::CUDADeviceContext, T> set_zero;
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
set_zero(dev_ctx, input_grad, static_cast<T>(0));
if (index_type == framework::proto::VarType::INT64) {
const int64_t* index_data = index->data<int64_t>();
IndexSampleGrad<T, int64_t><<<grid_dim, block_dim, 0, stream>>>(
index_data, input_grad_data, output_grad_data, index_length,
input_length, batch_size, same_data_in_index_row);
} else if (index_type == framework::proto::VarType::INT32) {
const int* index_data = index->data<int>();
IndexSampleGrad<T, int><<<grid_dim, block_dim, 0, stream>>>(
index_data, input_grad_data, output_grad_data, index_length,
input_length, batch_size, same_data_in_index_row);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
index_sample,
ops::IndexSampleKernel<paddle::platform::CUDADeviceContext, float>,
ops::IndexSampleKernel<paddle::platform::CUDADeviceContext, double>,
ops::IndexSampleKernel<paddle::platform::CUDADeviceContext, int>,
ops::IndexSampleKernel<paddle::platform::CUDADeviceContext, int64_t>);
REGISTER_OP_CUDA_KERNEL(
index_sample_grad,
ops::IndexSampleGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::IndexSampleGradKernel<paddle::platform::CUDADeviceContext, double>,
ops::IndexSampleGradKernel<paddle::platform::CUDADeviceContext, int>,
ops::IndexSampleGradKernel<paddle::platform::CUDADeviceContext, int64_t>);
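Reviewer note: a CPU reference for the index arithmetic used by the kernels above: out[j][i] = x[j][index[j][i]], and the backward pass scatters dout into the sampled slots, which is why the CUDA gradient kernel needs atomicAdd when an index repeats inside a row. This is a standalone sketch of the math, not the Paddle kernels.
#include <cassert>
#include <vector>
// Forward gather and backward scatter-add for index_sample on flat buffers.
void IndexSampleRef(const std::vector<float>& x, const std::vector<int>& index,
                    int batch, int input_len, int index_len,
                    std::vector<float>* out, const std::vector<float>& dout,
                    std::vector<float>* dx) {
  out->assign(batch * index_len, 0.f);
  dx->assign(batch * input_len, 0.f);
  for (int j = 0; j < batch; ++j) {
    for (int i = 0; i < index_len; ++i) {
      int sample = index[j * index_len + i];
      (*out)[j * index_len + i] = x[j * input_len + sample];
      (*dx)[j * input_len + sample] += dout[j * index_len + i];
    }
  }
}
int main() {
  // batch=1, input_len=3, index_len=2, index = {2, 2} (repeated on purpose).
  std::vector<float> x = {10, 20, 30}, out, dx;
  std::vector<int> index = {2, 2};
  std::vector<float> dout = {1, 1};
  IndexSampleRef(x, index, 1, 3, 2, &out, dout, &dx);
  assert(out[0] == 30 && out[1] == 30);
  assert(dx[2] == 2);  // both gradients land on the same input slot
  return 0;
}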
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cmath>
#include <fstream>
#include <set>
#include <string>
#include <utility>
#include <vector>
#include "gflags/gflags.h"
#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
using DDim = framework::DDim;
template <typename T, typename IndexT = int>
void IndexSampleInner(const framework::ExecutionContext &context,
const LoDTensor &input, const LoDTensor &index,
LoDTensor *output) {
auto input_dims = input.dims();
auto index_dims = index.dims();
int batch_size = input_dims[0];
auto value_length = input_dims[1];
auto index_length = index_dims[1];
int index_ids_num = index.numel();
std::vector<T> input_vec;
std::vector<IndexT> index_vec;
paddle::framework::TensorToVector(input, context.device_context(),
&input_vec);
paddle::framework::TensorToVector(index, context.device_context(),
&index_vec);
std::vector<T> res(index_ids_num);
for (int i = 0; i < index_ids_num; i++) {
int b = floor(i / index_length);
PADDLE_ENFORCE_GE(
index_vec[i], 0,
platform::errors::InvalidArgument(
"Variable value (index) of OP(index_sample) "
"expected >= 0 and < %ld, but got %ld. Please check input "
"value.",
value_length, index_vec[i]));
PADDLE_ENFORCE_LT(
index_vec[i], value_length,
platform::errors::InvalidArgument(
"Variable value (index) of OP(index_sample) "
"expected >= 0 and < %ld, but got %ld. Please check input "
"value.",
value_length, index_vec[i]));
int v_i = b * value_length + static_cast<int>(index_vec[i]);
T v = input_vec[v_i];
VLOG(4) << "Index Sample: batch = " << b << " index = " << v_i
<< " value = " << v;
res[i] = v;
}
auto ddim = phi::make_ddim({batch_size, index_length});
output->mutable_data<T>(context.GetPlace());
framework::TensorFromVector(res, context.device_context(), output);
output->Resize(ddim);
}
template <typename DeviceContext, typename T>
class IndexSampleKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto *input_var = ctx.InputVar("X");
auto *index_var = ctx.InputVar("Index");
auto &input_tensor = input_var->Get<LoDTensor>();
auto &index_tensor = index_var->Get<LoDTensor>();
auto *out_var = ctx.OutputVar("Out");
auto *out_tensor = out_var->GetMutable<framework::LoDTensor>();
const auto &index_type =
framework::TransToProtoVarType(index_tensor.dtype());
bool index_type_match = index_type == framework::proto::VarType::INT32 ||
index_type == framework::proto::VarType::INT64;
PADDLE_ENFORCE_EQ(index_type_match, true,
platform::errors::InvalidArgument(
"Input(Index) holds the wrong type, it holds %s, but "
"desires to be %s or %s",
paddle::framework::DataTypeToString(index_type),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT32),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT64)));
if (index_type == framework::proto::VarType::INT32) {
IndexSampleInner<T, int>(ctx, input_tensor, index_tensor, out_tensor);
} else if (index_type == framework::proto::VarType::INT64) {
IndexSampleInner<T, int64_t>(ctx, input_tensor, index_tensor, out_tensor);
}
}
};
template <typename T, typename IndexT = int>
void IndexSampleGradInner(const framework::ExecutionContext &context,
const LoDTensor &out_grad, const LoDTensor &index,
LoDTensor *x_grad) {
std::vector<T> out_grad_vec;
std::vector<IndexT> index_vec;
paddle::framework::TensorToVector(out_grad, context.device_context(),
&out_grad_vec);
paddle::framework::TensorToVector(index, context.device_context(),
&index_vec);
auto index_dims = index.dims();
auto x_grad_dims = x_grad->dims();
auto value_length = x_grad_dims[1];
auto index_length = index_dims[1];
int index_ids_num = index.numel();
std::vector<T> x_grad_vec(x_grad->numel(), 0);
for (int i = 0; i < index_ids_num; i++) {
int b = floor(i / index_length);
PADDLE_ENFORCE_GE(
index_vec[i], 0,
platform::errors::InvalidArgument(
"Variable value (index) of OP(index_sample_grad) "
"expected >= 0 and < %ld, but got %ld. Please check input "
"value.",
value_length, index_vec[i]));
PADDLE_ENFORCE_LT(
index_vec[i], value_length,
platform::errors::InvalidArgument(
"Variable value (index) of OP(index_sample_grad) "
"expected >= 0 and < %ld, but got %ld. Please check input "
"value.",
value_length, index_vec[i]));
int v_i = b * value_length + static_cast<int>(index_vec[i]);
x_grad_vec[v_i] += out_grad_vec[i];
}
x_grad->mutable_data<T>(context.GetPlace());
framework::TensorFromVector(x_grad_vec, context.device_context(), x_grad);
x_grad->Resize(x_grad_dims);
}
template <typename DeviceContext, typename T>
class IndexSampleGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
auto *index_var = context.InputVar("Index");
auto *x_grad_var = context.OutputVar(framework::GradVarName("X"));
auto *out_grad_var = context.InputVar(framework::GradVarName("Out"));
auto &index_tensor = index_var->Get<LoDTensor>();
auto &out_grad_tensor = out_grad_var->Get<LoDTensor>();
auto *x_grad_tensor = x_grad_var->GetMutable<framework::LoDTensor>();
const auto &index_type =
framework::TransToProtoVarType(index_tensor.dtype());
bool index_type_match = index_type == framework::proto::VarType::INT32 ||
index_type == framework::proto::VarType::INT64;
PADDLE_ENFORCE_EQ(index_type_match, true,
platform::errors::InvalidArgument(
"Input(Index) holds the wrong type, it holds %s, but "
"desires to be %s or %s",
paddle::framework::DataTypeToString(index_type),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT32),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT64)));
if (index_type == framework::proto::VarType::INT32) {
IndexSampleGradInner<T, int>(context, out_grad_tensor, index_tensor,
x_grad_tensor);
} else if (index_type == framework::proto::VarType::INT64) {
IndexSampleGradInner<T, int64_t>(context, out_grad_tensor, index_tensor,
x_grad_tensor);
}
}
};
} // namespace operators
} // namespace paddle
...@@ -12,8 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,8 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/index_sample_op.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle { namespace paddle {
......
...@@ -474,11 +474,11 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_bwd_1024_kernel( ...@@ -474,11 +474,11 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_bwd_1024_kernel(
for (int it = 0; it < LDGS; it++) { for (int it = 0; it < LDGS; it++) {
#pragma unroll #pragma unroll
for (int jt = 0; jt < VecSize; jt++) { for (int jt = 0; jt < VecSize; jt++) {
U x_tmp = x[it][jt]; U x_tmp = static_cast<U>(x[it][jt]);
U y_tmp = var_cur_row * (x_tmp - mean_cur_row); U y_tmp = var_cur_row * (x_tmp - mean_cur_row);
U dy_tmp = static_cast<U>(gamma[it][jt]) * U dy_tmp = static_cast<U>(gamma[it][jt]) *
static_cast<U>(dout[it][jt]); // scale * dy static_cast<U>(dout[it][jt]); // scale * dy
U dout_tmp = dout[it][jt]; // dy U dout_tmp = static_cast<U>(dout[it][jt]); // dy
// used for get dx (row reduction) // used for get dx (row reduction)
sum_loss1 += dy_tmp; // scale * dy, sum_1 sum_loss1 += dy_tmp; // scale * dy, sum_1
......
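Reviewer note: the fused layer-norm backward fix above casts x and dout to the accumulation type U before any arithmetic is done on them. Below is a minimal standalone sketch of that promote-then-accumulate pattern, using float/double as stand-ins for fp16/float.
#include <cstdio>
#include <vector>
// Cast each element to the wider type U first, then accumulate in U.
template <typename T, typename U>
U SumPromoted(const std::vector<T>& xs) {
  U acc = static_cast<U>(0);
  for (const T& x : xs) acc += static_cast<U>(x);  // promote before adding
  return acc;
}
int main() {
  std::vector<float> xs(1 << 20, 0.1f);
  std::printf("narrow accumulator: %.6f\n", SumPromoted<float, float>(xs));
  std::printf("wide accumulator:   %.6f\n", SumPromoted<float, double>(xs));
  return 0;
}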
...@@ -259,6 +259,21 @@ REGISTER_OP_CUDA_KERNEL( ...@@ -259,6 +259,21 @@ REGISTER_OP_CUDA_KERNEL(
ops::LayerNormGradKernel<paddle::platform::CUDADeviceContext, float>, ops::LayerNormGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::LayerNormGradKernel<paddle::platform::CUDADeviceContext, ops::LayerNormGradKernel<paddle::platform::CUDADeviceContext,
plat::float16>); plat::float16>);
#elif CUDNN_VERSION_MIN(8, 1, 0)
REGISTER_OP_CUDA_KERNEL(
layer_norm,
ops::LayerNormKernel<paddle::platform::CUDADeviceContext, float>,
ops::LayerNormKernel<paddle::platform::CUDADeviceContext, double>,
ops::LayerNormKernel<paddle::platform::CUDADeviceContext, plat::float16>,
ops::LayerNormKernel<paddle::platform::CUDADeviceContext, plat::bfloat16>);
REGISTER_OP_CUDA_KERNEL(
layer_norm_grad,
ops::LayerNormGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::LayerNormGradKernel<paddle::platform::CUDADeviceContext, double>,
ops::LayerNormGradKernel<paddle::platform::CUDADeviceContext,
plat::float16>,
ops::LayerNormGradKernel<paddle::platform::CUDADeviceContext,
plat::bfloat16>);
#else #else
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
layer_norm, layer_norm,
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/operators/mkldnn/axpy_handler.h" #include "paddle/fluid/operators/mkldnn/axpy_handler.h"
...@@ -502,32 +503,29 @@ struct MergeAdd<platform::XPUDeviceContext, T> { ...@@ -502,32 +503,29 @@ struct MergeAdd<platform::XPUDeviceContext, T> {
out.mutable_value()->mutable_data<T>( out.mutable_value()->mutable_data<T>(
phi::make_ddim({static_cast<int64_t>(merge_rows.size()), input_width}), phi::make_ddim({static_cast<int64_t>(merge_rows.size()), input_width}),
context.GetPlace()); context.GetPlace());
int r =
xpu::constant<T>(context.x_context(), out.mutable_value()->data<T>(),
merge_rows.size() * input_width, static_cast<T>(0.f));
PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
platform::errors::External("XPU constant op return"
" wrong value[%d %s].",
r, XPUAPIErrorMsg[r]));
std::unordered_map<int64_t, size_t> rows_to_id; std::unordered_map<int64_t, size_t> rows_to_id;
for (size_t i = 0; i < merge_rows.size(); ++i) { for (size_t i = 0; i < merge_rows.size(); ++i) {
rows_to_id[merge_rows[i]] = i; rows_to_id[merge_rows[i]] = i;
} }
auto* out_data = out.mutable_value()->data<T>(); auto* y_data = out.mutable_value()->data<T>();
auto* input_data = input.value().data<T>(); auto* x_data = input.value().data<T>();
int xm = input_rows.size();
int ym = merge_rows.size();
int n = input_width; int n = input_width;
for (size_t i = 0; i < input_rows.size(); i++) {
size_t out_i = rows_to_id[input_rows[i]]; xpu::ctx_guard RAII_GUARD(context.x_context());
auto r = xpu::add(context.x_context(), &input_data[i * input_width], int64_t* x_rows_data = RAII_GUARD.alloc_l3_or_gm<int64_t>(xm);
&out_data[out_i * input_width], int64_t* y_rows_data = RAII_GUARD.alloc_l3_or_gm<int64_t>(ym);
&out_data[out_i * input_width], n); memory::Copy(context.GetPlace(), y_rows_data, platform::CPUPlace(),
PADDLE_ENFORCE_EQ( merge_rows.data(), ym * sizeof(int64_t));
r, XPU_SUCCESS, memory::Copy(context.GetPlace(), x_rows_data, platform::CPUPlace(),
platform::errors::External("XPU API return wrong value[%d %s], ", r, input_rows.data(), xm * sizeof(int64_t));
XPUAPIErrorMsg[r])); int r =
} xpu::merge_dup_rows<T, int64_t>(context.x_context(), x_data, y_data,
x_rows_data, y_rows_data, xm, n, ym);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "merge_dup_rows");
} }
void operator()(const platform::XPUDeviceContext& context, void operator()(const platform::XPUDeviceContext& context,
...@@ -582,15 +580,7 @@ struct MergeAdd<platform::XPUDeviceContext, T> { ...@@ -582,15 +580,7 @@ struct MergeAdd<platform::XPUDeviceContext, T> {
{static_cast<int64_t>(merged_row_set.size()), input_width}), {static_cast<int64_t>(merged_row_set.size()), input_width}),
context.GetPlace()); context.GetPlace());
int r = float* y_data = reinterpret_cast<float*>(out.mutable_value()->data<T>());
xpu::constant<T>(context.x_context(), out.mutable_value()->data<T>(),
merge_rows.size() * input_width, static_cast<T>(0.f));
PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
platform::errors::External("XPU constant op return"
" wrong value[%d %s].",
r, XPUAPIErrorMsg[r]));
float* out_data = reinterpret_cast<float*>(out.mutable_value()->data<T>());
std::unordered_map<int64_t, size_t> rows_to_id; std::unordered_map<int64_t, size_t> rows_to_id;
for (size_t i = 0; i < merge_rows.size(); ++i) { for (size_t i = 0; i < merge_rows.size(); ++i) {
...@@ -603,17 +593,22 @@ struct MergeAdd<platform::XPUDeviceContext, T> { ...@@ -603,17 +593,22 @@ struct MergeAdd<platform::XPUDeviceContext, T> {
} }
auto& input_rows = input->rows(); auto& input_rows = input->rows();
auto* x_data = input->value().data<T>();
int xm = input_rows.size();
int ym = merge_rows.size();
int n = input_width; int n = input_width;
for (size_t i = 0; i < input_rows.size(); i++) {
size_t out_i = rows_to_id[input_rows[i]]; xpu::ctx_guard RAII_GUARD(context.x_context());
auto r = xpu::add( int64_t* x_rows_data = RAII_GUARD.alloc_l3_or_gm<int64_t>(xm);
context.x_context(), input->value().data<T>() + i * input_width, int64_t* y_rows_data = RAII_GUARD.alloc_l3_or_gm<int64_t>(ym);
&out_data[out_i * input_width], &out_data[out_i * input_width], n); memory::Copy(context.GetPlace(), y_rows_data, platform::CPUPlace(),
PADDLE_ENFORCE_EQ( merge_rows.data(), ym * sizeof(int64_t));
r, XPU_SUCCESS, memory::Copy(context.GetPlace(), x_rows_data, platform::CPUPlace(),
platform::errors::External("XPU API return wrong value[%d %s], ", r, input_rows.data(), xm * sizeof(int64_t));
XPUAPIErrorMsg[r])); int r =
} xpu::merge_dup_rows<T, int64_t>(context.x_context(), x_data, y_data,
x_rows_data, y_rows_data, xm, n, ym);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "merge_dup_rows");
} }
} }
}; };
......
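Reviewer note: the per-row xpu::add loop is replaced by a single xpu::merge_dup_rows call. The CPU reference below spells out the semantics that call is expected to provide, namely that rows of x sharing a row id are summed into the matching merged output row; it is a standalone sketch, not the XPU implementation.
#include <cassert>
#include <cstdint>
#include <unordered_map>
#include <vector>
// x is [x_rows.size(), n]; y becomes [y_rows.size(), n] with duplicates summed.
void MergeDupRowsRef(const std::vector<float>& x,
                     const std::vector<int64_t>& x_rows,
                     const std::vector<int64_t>& y_rows, int n,
                     std::vector<float>* y) {
  std::unordered_map<int64_t, size_t> row_to_out;
  for (size_t i = 0; i < y_rows.size(); ++i) row_to_out[y_rows[i]] = i;
  y->assign(y_rows.size() * n, 0.f);
  for (size_t i = 0; i < x_rows.size(); ++i) {
    size_t out_i = row_to_out.at(x_rows[i]);
    for (int j = 0; j < n; ++j) (*y)[out_i * n + j] += x[i * n + j];
  }
}
int main() {
  // Input rows {0, 2, 0} with width 2; merged rows {0, 2}.
  std::vector<float> x = {1, 1, 2, 2, 3, 3}, y;
  MergeDupRowsRef(x, {0, 2, 0}, {0, 2}, 2, &y);
  assert((y == std::vector<float>{4, 4, 2, 2}));
  return 0;
}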
...@@ -16,6 +16,7 @@ limitations under the License. */ ...@@ -16,6 +16,7 @@ limitations under the License. */
#include <vector> #include <vector>
#include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/platform/bfloat16.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/float16.h"
#include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/math_function.h"
...@@ -445,6 +446,7 @@ template struct MergeAdd<platform::CUDADeviceContext, double>; ...@@ -445,6 +446,7 @@ template struct MergeAdd<platform::CUDADeviceContext, double>;
template struct MergeAdd<platform::CUDADeviceContext, int>; template struct MergeAdd<platform::CUDADeviceContext, int>;
template struct MergeAdd<platform::CUDADeviceContext, int64_t>; template struct MergeAdd<platform::CUDADeviceContext, int64_t>;
template struct MergeAdd<platform::CUDADeviceContext, platform::float16>; template struct MergeAdd<platform::CUDADeviceContext, platform::float16>;
template struct MergeAdd<platform::CUDADeviceContext, platform::bfloat16>;
template struct MergeAdd<platform::CUDADeviceContext, platform::complex<float>>; template struct MergeAdd<platform::CUDADeviceContext, platform::complex<float>>;
template struct MergeAdd<platform::CUDADeviceContext, template struct MergeAdd<platform::CUDADeviceContext,
platform::complex<double>>; platform::complex<double>>;
......
...@@ -61,30 +61,31 @@ class DistributedFusedLambInitOpMaker ...@@ -61,30 +61,31 @@ class DistributedFusedLambInitOpMaker
"The fp32 beta1 power accumulator tensor. Its shape is [1]."); "The fp32 beta1 power accumulator tensor. Its shape is [1].");
AddOutput("Beta2Pow", AddOutput("Beta2Pow",
"The fp32 beta2 power accumulator tensor. Its shape is [1]."); "The fp32 beta2 power accumulator tensor. Its shape is [1].");
AddOutput("FusedIndices",
"The param index of each element in FP32FusedParam. Its shape is "
"[M1+M2]. It is like [0,0,0,1,1,1,1,2,2,...].");
AddOutput( AddOutput(
"FusedParamOffsets", "FusedParamOffsets",
"The numel offset of each parameter inside the FP32FusedParam. Its " "The numel offset of each parameter inside the FP32FusedParam. Its "
"shape is [param_num + 1]. It is like [0, n_0, n_0 + n_1, n_0 + n_1 " "shape is [param_num + 1]. It is like [0, n_0, n_0 + n_1, n_0 + n_1 "
"+ n_2, ...]."); "+ n_2, ...]. It should be in CPUPlace.");
AddOutput("FP32ShardFusedParamOffsets",
"The sharded numel offset of each parameter in the local rank. "
"Its shape is [fp32_local_param_num + 1].");
AddOutput("FP16ShardFusedParamOffsets",
"The sharded numel offset of each parameter in the local rank. "
"Its shape is [fp16_local_param_num + 1].");
AddOutput( AddOutput(
"WeightDecay", "FP32ShardFusedParamOffsets",
"The sharded fp32 weight decay tensor. Its shape is [(M1+M2)/N]."); "The sharded numel offset of each parameter in the local rank. "
"Its shape is [fp32_local_param_num + 1]. It should be in CPUPlace.");
AddOutput(
"FP16ShardFusedParamOffsets",
"The sharded numel offset of each parameter in the local rank. "
"Its shape is [fp16_local_param_num + 1]. It should be in CPUPlace.");
AddOutput("ParamInfo", AddOutput("ParamInfo",
"The param info. It should be in CPUPlace, and its shape is [6]" "The param info. It should be in CPUPlace, and its shape is [6]"
"CPUPlace, and its shape is [6]. It is " "CPUPlace, and its shape is [8]. It is "
"[fp32_shard_param_start_idx, fp32_local_param_num, " "[fp32_shard_param_start_idx, fp32_local_param_num, "
"fp32_global_param_num, fp16_shard_param_start_idx, " "fp32_global_param_num, fp32_weight_decay_end_idx, "
"fp16_local_param_num, fp16_global_param_num]."); "fp16_shard_param_start_idx, "
"fp16_local_param_num, fp16_global_param_num, "
"fp16_weight_decay_end_idx].");
AddOutput("ParamOrder",
"The reordered parameter order. Inside this op, "
"the parameter would be reordered by data type and weight decay "
"value.");
AddOutput("ParamOut", "The output parameter list.").AsDuplicable(); AddOutput("ParamOut", "The output parameter list.").AsDuplicable();
AddOutput("MasterParamOut", AddOutput("MasterParamOut",
"The output master parameter list. It would share the memory of " "The output master parameter list. It would share the memory of "
...@@ -96,10 +97,8 @@ class DistributedFusedLambInitOpMaker ...@@ -96,10 +97,8 @@ class DistributedFusedLambInitOpMaker
AddAttr<float>("beta1", "The initial value of Beta1Pow."); AddAttr<float>("beta1", "The initial value of Beta1Pow.");
AddAttr<float>("beta2", "The initial value of Beta2Pow."); AddAttr<float>("beta2", "The initial value of Beta2Pow.");
AddAttr<std::vector<float>>( AddAttr<std::vector<int>>("apply_weight_decay",
"weight_decay", "Whether to apply weight decay.");
"The weight decay for each parameter. Its "
"shape is equal to the global parameter number.");
AddAttr<int>("alignment", "The alignment in bytes for the fused tensors."); AddAttr<int>("alignment", "The alignment in bytes for the fused tensors.");
AddAttr<int>("rank", "The global rank of the current process."); AddAttr<int>("rank", "The global rank of the current process.");
AddAttr<int>("nranks", "The global world size."); AddAttr<int>("nranks", "The global world size.");
......
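Reviewer note: ParamInfo grows from six to eight entries, and the WeightDecay/FusedIndices outputs give way to the apply_weight_decay attribute and the new ParamOrder output. The hypothetical helper below (not in the patch) just names the eight slots in the documented order, for readability.
#include <cassert>
// Named view over the eight ints written into the ParamInfo tensor.
struct ParamInfoView {
  int fp32_shard_param_start_idx;
  int fp32_local_param_num;
  int fp32_global_param_num;
  int fp32_weight_decay_end_idx;  // local index where weight decay stops applying
  int fp16_shard_param_start_idx;
  int fp16_local_param_num;
  int fp16_global_param_num;
  int fp16_weight_decay_end_idx;
};
ParamInfoView UnpackParamInfo(const int* param_info /* length 8 */) {
  return {param_info[0], param_info[1], param_info[2], param_info[3],
          param_info[4], param_info[5], param_info[6], param_info[7]};
}
int main() {
  int raw[8] = {0, 2, 4, 1, 4, 3, 6, 2};
  ParamInfoView v = UnpackParamInfo(raw);
  assert(v.fp16_local_param_num == 3);
  return 0;
}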
...@@ -258,32 +258,6 @@ static void ShareBufferForNonInitedTensor(framework::Tensor *origin, ...@@ -258,32 +258,6 @@ static void ShareBufferForNonInitedTensor(framework::Tensor *origin,
<< ") , dtype = " << fused_out->dtype(); << ") , dtype = " << fused_out->dtype();
} }
template <typename OffsetT, typename IndexT>
static __global__ void LambFillFusedIndicesCUDAKernel(const OffsetT *offsets,
IndexT *out,
int offset_num,
int out_num) {
CUDA_KERNEL_LOOP_TYPE(i, out_num, int) {
auto idx = phi::funcs::LowerBound(offsets, offset_num, i);
if (idx == offset_num || offsets[idx] != i) {
--idx;
}
out[i] = idx;
}
}
template <typename T>
static void CopyVectorToTensor(const std::vector<T> &src,
framework::Tensor *dst,
const platform::Place &place,
gpuStream_t stream) {
dst->Resize({static_cast<int64_t>(src.size())});
T *dst_ptr = dst->mutable_data<T>(place);
const T *src_ptr = src.data();
auto nbytes = src.size() * sizeof(T);
memory::Copy(place, dst_ptr, platform::CPUPlace(), src_ptr, nbytes, stream);
}
template <typename T> template <typename T>
static void CopyVectorToCPUTensor(const std::vector<T> &src, static void CopyVectorToCPUTensor(const std::vector<T> &src,
framework::Tensor *dst) { framework::Tensor *dst) {
...@@ -294,6 +268,42 @@ static void CopyVectorToCPUTensor(const std::vector<T> &src, ...@@ -294,6 +268,42 @@ static void CopyVectorToCPUTensor(const std::vector<T> &src,
std::memcpy(dst_ptr, src_ptr, nbytes); std::memcpy(dst_ptr, src_ptr, nbytes);
} }
static size_t ReorderParamGradInfoList(const std::vector<int> &flags,
std::vector<ParamGradInfo> *infos) {
size_t n = infos->size();
std::vector<int> cur_flags;
cur_flags.reserve(n);
for (size_t i = 0; i < n; ++i) {
auto idx = (*infos)[i].idx;
cur_flags.push_back(flags[idx]);
}
auto origin_infos = *infos;
size_t j = 0;
for (size_t i = 0; i < n; ++i) {
if (cur_flags[i]) {
(*infos)[j] = origin_infos[i];
++j;
}
}
size_t ret_idx = j;
for (size_t i = 0; i < n; ++i) {
if (!cur_flags[i]) {
(*infos)[j] = origin_infos[i];
++j;
}
}
return ret_idx;
}
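// Reviewer sketch (not part of the patch): ReorderParamGradInfoList above is a
// stable partition keyed on a 0/1 flag list indexed by each info's original
// idx; flagged entries come first and the return value is the boundary.
// The same behaviour on plain ints, standalone:
#include <cassert>
#include <vector>
size_t StablePartitionByFlag(const std::vector<int>& flags,
                             std::vector<int>* idxs) {
  std::vector<int> kept, rest;
  for (int idx : *idxs) {
    (flags[idx] ? kept : rest).push_back(idx);  // preserve relative order
  }
  size_t boundary = kept.size();
  kept.insert(kept.end(), rest.begin(), rest.end());
  *idxs = kept;
  return boundary;
}
int main() {
  std::vector<int> flags = {1, 0, 1, 0};  // apply_weight_decay per parameter
  std::vector<int> order = {0, 1, 2, 3};  // stands in for the info list
  size_t end = StablePartitionByFlag(flags, &order);
  assert(end == 2 && order == (std::vector<int>{0, 2, 1, 3}));
  return 0;
}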
template <typename T>
static T ClipByBound(T x, T low_value, T high_value) {
if (x < low_value) return low_value;
if (x > high_value) return high_value;
return x;
}
template <typename T> template <typename T>
class DistributedFusedLambInitOpKernel<platform::CUDADeviceContext, T> class DistributedFusedLambInitOpKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> { : public framework::OpKernel<T> {
...@@ -404,6 +414,24 @@ class DistributedFusedLambInitOpKernel<platform::CUDADeviceContext, T> ...@@ -404,6 +414,24 @@ class DistributedFusedLambInitOpKernel<platform::CUDADeviceContext, T>
info->numel_offset = 0; // not determined yet info->numel_offset = 0; // not determined yet
} }
} }
const auto &apply_weight_decay =
ctx.Attr<std::vector<int>>("apply_weight_decay");
size_t fp32_wd_end_idx =
ReorderParamGradInfoList(apply_weight_decay, &fp32_infos);
size_t fp16_wd_end_idx =
ReorderParamGradInfoList(apply_weight_decay, &fp16_infos);
auto *param_order_t = ctx.Output<framework::Tensor>("ParamOrder");
auto param_num = fp32_infos.size() + fp16_infos.size();
param_order_t->Resize({static_cast<int64_t>(param_num)});
auto *param_order = param_order_t->mutable_data<int>(platform::CPUPlace());
for (size_t i = 0; i < fp32_infos.size(); ++i) {
param_order[i] = static_cast<int>(fp32_infos[i].idx);
}
for (size_t i = 0; i < fp16_infos.size(); ++i) {
param_order[i + fp32_infos.size()] = static_cast<int>(fp16_infos[i].idx);
}
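// Example: if the original parameter list is {fp32_a, fp16_b, fp32_c} with
// idx = {0, 1, 2}, ParamOrder becomes [0, 2, 1]: all FP32 parameters first
// (each group already reordered so the weight-decayed ones lead), followed
// by all FP16 parameters.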
VLOG(10) << "Fill ParamGradInfo ends"; VLOG(10) << "Fill ParamGradInfo ends";
// Step 2: determine the numel_with_padding and numel_offset // Step 2: determine the numel_with_padding and numel_offset
...@@ -568,45 +596,29 @@ class DistributedFusedLambInitOpKernel<platform::CUDADeviceContext, T> ...@@ -568,45 +596,29 @@ class DistributedFusedLambInitOpKernel<platform::CUDADeviceContext, T>
VLOG(10) << "Found the sharding arguments"; VLOG(10) << "Found the sharding arguments";
auto *param_info_t = ctx.Output<framework::Tensor>("ParamInfo"); auto *param_info_t = ctx.Output<framework::Tensor>("ParamInfo");
param_info_t->Resize({6}); param_info_t->Resize({8});
auto *param_info = param_info_t->mutable_data<int>(platform::CPUPlace()); auto *param_info = param_info_t->mutable_data<int>(platform::CPUPlace());
param_info[0] = static_cast<int>(fp32_start_idx); param_info[0] = static_cast<int>(fp32_start_idx);
param_info[1] = static_cast<int>(fp32_local_param_num); param_info[1] = static_cast<int>(fp32_local_param_num);
param_info[2] = static_cast<int>(fp32_infos.size()); param_info[2] = static_cast<int>(fp32_infos.size());
param_info[3] = static_cast<int>(fp16_start_idx + fp32_infos.size()); param_info[3] = ClipByBound<int>(fp32_wd_end_idx, fp32_start_idx,
param_info[4] = static_cast<int>(fp16_local_param_num); fp32_start_idx + fp32_local_param_num) -
param_info[5] = static_cast<int>(fp16_infos.size()); static_cast<int>(fp32_start_idx);
param_info[4] = static_cast<int>(fp16_start_idx + fp32_infos.size());
param_info[5] = static_cast<int>(fp16_local_param_num);
param_info[6] = static_cast<int>(fp16_infos.size());
param_info[7] = ClipByBound<int>(fp16_wd_end_idx, fp16_start_idx,
fp16_start_idx + fp16_local_param_num) -
static_cast<int>(fp16_start_idx);
VLOG(10) << "Start FP32 idx: " << param_info[0]; VLOG(10) << "Start FP32 idx: " << param_info[0];
VLOG(10) << "Local FP32 param num: " << param_info[1]; VLOG(10) << "Local FP32 param num: " << param_info[1];
VLOG(10) << "Global FP32 param num: " << param_info[2]; VLOG(10) << "Global FP32 param num: " << param_info[2];
VLOG(10) << "Start FP16 idx: " << param_info[3]; VLOG(10) << "Start FP16 idx: " << param_info[4];
VLOG(10) << "Local FP16 param num: " << param_info[4]; VLOG(10) << "Local FP16 param num: " << param_info[5];
VLOG(10) << "Global FP16 param num: " << param_info[5]; VLOG(10) << "Global FP16 param num: " << param_info[6];
// For WeightDecay, shard and perform H2D copy
const auto &origin_weight_decay =
ctx.Attr<std::vector<float>>("weight_decay");
PADDLE_ENFORCE_EQ(params.size(), origin_weight_decay.size(),
platform::errors::InvalidArgument(
"The attr(weight_decay) should have the "
"same length with Input(Param)."));
std::vector<float> shard_weight_decay;
shard_weight_decay.reserve(total_local_param_num);
for (size_t i = 0; i < fp32_local_param_num; ++i) {
shard_weight_decay.push_back(
origin_weight_decay[fp32_infos[i + fp32_start_idx].idx]);
}
for (size_t i = 0; i < fp16_local_param_num; ++i) {
shard_weight_decay.push_back(
origin_weight_decay[fp16_infos[i + fp16_start_idx].idx]);
}
// For FusedIndices, launch CUDA kernel to do binary search
auto *fused_indices_t = ctx.Output<framework::Tensor>("FusedIndices");
fused_indices_t->Resize({static_cast<int64_t>(total_numel)});
auto *fused_indices = fused_indices_t->mutable_data<int>(place);
std::vector<int> numel_offsets; std::vector<int> numel_offsets;
numel_offsets.reserve(params.size() + 1); numel_offsets.reserve(params.size() + 1);
for (const auto &info : fp32_infos) { for (const auto &info : fp32_infos) {
...@@ -621,21 +633,6 @@ class DistributedFusedLambInitOpKernel<platform::CUDADeviceContext, T> ...@@ -621,21 +633,6 @@ class DistributedFusedLambInitOpKernel<platform::CUDADeviceContext, T>
"The numel_offsets number must be one larger than " "The numel_offsets number must be one larger than "
"the parameter number.")); "the parameter number."));
VLOG(10) << "Total numel offset: " << FlattenToString(numel_offsets); VLOG(10) << "Total numel offset: " << FlattenToString(numel_offsets);
auto *fused_param_offset_t =
ctx.Output<framework::Tensor>("FusedParamOffsets");
fused_param_offset_t->Resize({static_cast<int64_t>(numel_offsets.size())});
auto *fused_param_offset = fused_param_offset_t->mutable_data<int>(place);
memory::Copy(place, fused_param_offset, platform::CPUPlace(),
numel_offsets.data(),
numel_offsets.size() * sizeof(numel_offsets[0]), stream);
auto config = platform::GetGpuLaunchConfig1D(dev_ctx, total_numel);
LambFillFusedIndicesCUDAKernel<<<config.block_per_grid,
config.thread_per_block, 0, stream>>>(
fused_param_offset, fused_indices, numel_offsets.size() - 1,
total_numel);
std::vector<int> lengths;
lengths.reserve(fp32_local_param_num + fp16_local_param_num);
std::vector<int> fp32_partial_numel_offsets; std::vector<int> fp32_partial_numel_offsets;
fp32_partial_numel_offsets.reserve(fp32_local_param_num + 1); fp32_partial_numel_offsets.reserve(fp32_local_param_num + 1);
...@@ -659,9 +656,9 @@ class DistributedFusedLambInitOpKernel<platform::CUDADeviceContext, T> ...@@ -659,9 +656,9 @@ class DistributedFusedLambInitOpKernel<platform::CUDADeviceContext, T>
VLOG(10) << "FP32 Partial numel = [" VLOG(10) << "FP32 Partial numel = ["
<< valid_start_n + fp32_infos[i].numel << "," << valid_start_n + fp32_infos[i].numel << ","
<< end_n + fp32_infos[i].numel; << end_n + fp32_infos[i].numel;
lengths.push_back(end_n - valid_start_n); auto len = end_n - valid_start_n;
fp32_partial_numel_offsets.push_back(fp32_partial_numel_offsets.back() + fp32_partial_numel_offsets.push_back(fp32_partial_numel_offsets.back() +
lengths.back()); len);
} }
std::vector<int> fp16_partial_numel_offsets; std::vector<int> fp16_partial_numel_offsets;
...@@ -682,9 +679,9 @@ class DistributedFusedLambInitOpKernel<platform::CUDADeviceContext, T> ...@@ -682,9 +679,9 @@ class DistributedFusedLambInitOpKernel<platform::CUDADeviceContext, T>
PADDLE_ENFORCE_NE(valid_start_n, end_n, PADDLE_ENFORCE_NE(valid_start_n, end_n,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"Indices sharding error. This may be a bug.")); "Indices sharding error. This may be a bug."));
lengths.push_back(end_n - valid_start_n); auto len = end_n - valid_start_n;
fp16_partial_numel_offsets.push_back(fp16_partial_numel_offsets.back() + fp16_partial_numel_offsets.push_back(fp16_partial_numel_offsets.back() +
lengths.back()); len);
} }
CopyVectorToCPUTensor(numel_offsets, CopyVectorToCPUTensor(numel_offsets,
...@@ -696,23 +693,6 @@ class DistributedFusedLambInitOpKernel<platform::CUDADeviceContext, T> ...@@ -696,23 +693,6 @@ class DistributedFusedLambInitOpKernel<platform::CUDADeviceContext, T>
fp16_partial_numel_offsets, fp16_partial_numel_offsets,
ctx.Output<framework::Tensor>("FP16ShardFusedParamOffsets")); ctx.Output<framework::Tensor>("FP16ShardFusedParamOffsets"));
// Fill the weight decay tensor
PADDLE_ENFORCE_EQ(lengths.size(), shard_weight_decay.size(),
platform::errors::InvalidArgument(
"Invalid weight decay sharding. This may be a bug."));
std::vector<float> wd_cpu;
for (size_t i = 0; i < shard_weight_decay.size(); ++i) {
int len = lengths[i];
for (int j = 0; j < len; ++j) {
wd_cpu.push_back(shard_weight_decay[i]);
}
}
PADDLE_ENFORCE_EQ(wd_cpu.size() * nranks, fp32_numel + fp16_numel,
platform::errors::InvalidArgument(
"Invalid weight decay sharding. This may be a bug."));
CopyVectorToTensor(wd_cpu, ctx.Output<framework::Tensor>("WeightDecay"),
place, stream);
auto *global_scale = ctx.Output<framework::Tensor>("GlobalScale"); auto *global_scale = ctx.Output<framework::Tensor>("GlobalScale");
if (!global_scale->IsInitialized()) { if (!global_scale->IsInitialized()) {
TensorFillConstant<float>(dev_ctx, global_scale, {1}, 1.0f); TensorFillConstant<float>(dev_ctx, global_scale, {1}, 1.0f);
......
...@@ -66,28 +66,31 @@ class DistributedFusedLambOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -66,28 +66,31 @@ class DistributedFusedLambOpMaker : public framework::OpProtoAndCheckerMaker {
"The fp32 beta1 power accumulator tensor. Its shape is [1]."); "The fp32 beta1 power accumulator tensor. Its shape is [1].");
AddInput("Beta2Pow", AddInput("Beta2Pow",
"The fp32 beta2 power accumulator tensor. Its shape is [1]."); "The fp32 beta2 power accumulator tensor. Its shape is [1].");
AddInput("FusedIndices",
"The param index of each element in FP32FusedParam. Its shape is "
"[M1+M2]. It is like [0,0,0,1,1,1,1,2,2,...].");
AddInput( AddInput(
"FusedParamOffsets", "FusedParamOffsets",
"The numel offset of each parameter inside the FP32FusedParam. Its " "The numel offset of each parameter inside the FP32FusedParam. Its "
"shape is [param_num + 1]. It is like [0, n_0, n_0 + n_1, n_0 + n_1 " "shape is [param_num + 1]. It is like [0, n_0, n_0 + n_1, n_0 + n_1 "
"+ n_2, ...]."); "+ n_2, ...]. It should be in CPUPlace.");
AddInput("FP32ShardFusedParamOffsets", AddInput(
"The sharded numel offset of each parameter in the local rank. " "FP32ShardFusedParamOffsets",
"Its shape is [fp32_local_param_num + 1]."); "The sharded numel offset of each parameter in the local rank. "
AddInput("FP16ShardFusedParamOffsets", "Its shape is [fp32_local_param_num + 1]. It should be in CPUPlace.");
"The sharded numel offset of each parameter in the local rank. " AddInput(
"Its shape is [fp16_local_param_num + 1]."); "FP16ShardFusedParamOffsets",
AddInput("WeightDecay", "The sharded numel offset of each parameter in the local rank. "
"The sharded fp32 weight decay tensor. Its shape is [(M1+M2)/N]."); "Its shape is [fp16_local_param_num + 1]. It should be in CPUPlace.");
AddInput("ParamInfo", AddInput("ParamInfo",
"The param info. It should be in CPUPlace, and its shape is [6]" "The param info. It should be in CPUPlace, and its shape is [6]"
"CPUPlace, and its shape is [6]. It is " "CPUPlace, and its shape is [8]. It is "
"[fp32_shard_param_start_idx, fp32_local_param_num, " "[fp32_shard_param_start_idx, fp32_local_param_num, "
"fp32_global_param_num, fp16_shard_param_start_idx, " "fp32_global_param_num, fp32_weight_decay_end_idx, "
"fp16_local_param_num, fp16_global_param_num]."); "fp16_shard_param_start_idx, "
"fp16_local_param_num, fp16_global_param_num, "
"fp16_weight_decay_end_idx].");
AddInput("ParamOrder",
"The reordered parameter order. Inside this op, "
"the parameter would be reordered by data type and weight decay "
"value.");
AddInput("LearningRate", AddInput("LearningRate",
"The fp32 learning rate tensor. Its shape is [1]."); "The fp32 learning rate tensor. Its shape is [1].");
...@@ -116,6 +119,7 @@ class DistributedFusedLambOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -116,6 +119,7 @@ class DistributedFusedLambOpMaker : public framework::OpProtoAndCheckerMaker {
"max_global_grad_norm", "max_global_grad_norm",
"The maximum global gradient l2-norm value for clipping. If " "The maximum global gradient l2-norm value for clipping. If "
"max_global_grad_norm <= 0, no clipping would be performed."); "max_global_grad_norm <= 0, no clipping would be performed.");
AddAttr<float>("weight_decay", "The weight decay value.");
AddAttr<bool>("clip_after_allreduce", AddAttr<bool>("clip_after_allreduce",
"Whether to clip before allreduce, only valid when the " "Whether to clip before allreduce, only valid when the "
"world size is larger than 1."); "world size is larger than 1.");
......
...@@ -87,7 +87,7 @@ struct L2NormFunctor { ...@@ -87,7 +87,7 @@ struct L2NormFunctor {
} }
}; };
template <typename InT, typename OutT, int BlockDim, bool NeedSqrt> template <typename InT, typename OutT, int BlockDim>
static __global__ void MultiTensorL2NormReduceAgainCUDAKernel( static __global__ void MultiTensorL2NormReduceAgainCUDAKernel(
const InT *x, OutT *y, int max_chunk_num) { const InT *x, OutT *y, int max_chunk_num) {
int tensor_id = blockIdx.x; int tensor_id = blockIdx.x;
...@@ -100,11 +100,7 @@ static __global__ void MultiTensorL2NormReduceAgainCUDAKernel( ...@@ -100,11 +100,7 @@ static __global__ void MultiTensorL2NormReduceAgainCUDAKernel(
} }
sum = BlockReduce(storage).Reduce(sum, cub::Sum()); sum = BlockReduce(storage).Reduce(sum, cub::Sum());
if (threadIdx.x == 0) { if (threadIdx.x == 0) {
if (NeedSqrt) { y[blockIdx.x] = static_cast<OutT>(sum);
y[blockIdx.x] = static_cast<OutT>(sqrtf(sum));
} else {
y[blockIdx.x] = static_cast<OutT>(sum);
}
} }
} }
...@@ -118,6 +114,7 @@ static int GetChunkedVecSize(const T *ptr, int chunk_size) { ...@@ -118,6 +114,7 @@ static int GetChunkedVecSize(const T *ptr, int chunk_size) {
constexpr int vec8 = alignof(platform::AlignedVector<T, 8>); constexpr int vec8 = alignof(platform::AlignedVector<T, 8>);
constexpr int vec4 = alignof(platform::AlignedVector<T, 4>); constexpr int vec4 = alignof(platform::AlignedVector<T, 4>);
constexpr int vec2 = alignof(platform::AlignedVector<T, 2>); constexpr int vec2 = alignof(platform::AlignedVector<T, 2>);
chunk_size *= sizeof(T);
if (address % vec8 == 0 && chunk_size % vec8 == 0) { if (address % vec8 == 0 && chunk_size % vec8 == 0) {
return std::min(8, valid_vec_size); return std::min(8, valid_vec_size);
} else if (address % vec4 == 0 && chunk_size % vec4 == 0) { } else if (address % vec4 == 0 && chunk_size % vec4 == 0) {
...@@ -129,27 +126,26 @@ static int GetChunkedVecSize(const T *ptr, int chunk_size) { ...@@ -129,27 +126,26 @@ static int GetChunkedVecSize(const T *ptr, int chunk_size) {
} }
} }
#define PD_VEC_MULTI_TENSOR_APPLY_CASE(__vec_size, ...) \ #define PD_VEC_LAUNCH_KERNEL_CASE(__vec_size, ...) \
case __vec_size: { \ case __vec_size: { \
constexpr int kVecSize = __vec_size; \ constexpr int kVecSize = __vec_size; \
__VA_ARGS__; \ __VA_ARGS__; \
break; \ break; \
} }
#define PD_VEC_MULTI_TENSOR_APPLY(__vec_size, ...) \ #define PD_VEC_LAUNCH_KERNEL(__vec_size, ...) \
do { \ do { \
switch (__vec_size) { \ switch (__vec_size) { \
PD_VEC_MULTI_TENSOR_APPLY_CASE(8, __VA_ARGS__); \ PD_VEC_LAUNCH_KERNEL_CASE(8, __VA_ARGS__); \
PD_VEC_MULTI_TENSOR_APPLY_CASE(4, __VA_ARGS__); \ PD_VEC_LAUNCH_KERNEL_CASE(4, __VA_ARGS__); \
PD_VEC_MULTI_TENSOR_APPLY_CASE(2, __VA_ARGS__); \ PD_VEC_LAUNCH_KERNEL_CASE(2, __VA_ARGS__); \
PD_VEC_MULTI_TENSOR_APPLY_CASE(1, __VA_ARGS__); \ PD_VEC_LAUNCH_KERNEL_CASE(1, __VA_ARGS__); \
} \ } \
} while (0) } while (0)
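// Usage sketch (illustrative; MyKernel, grid, block, ptr and n are
// placeholders, not part of this file): dispatch on the runtime vector size
// so that it becomes the compile-time constant kVecSize.
//
//   #define PD_LAUNCH_MY_KERNEL                                           \
//     do {                                                                \
//       MyKernel<T, kVecSize><<<grid, block, 0, stream>>>(ptr, n);        \
//     } while (0)
//
//   PD_VEC_LAUNCH_KERNEL(vec_size, PD_LAUNCH_MY_KERNEL);
//   #undef PD_LAUNCH_MY_KERNEL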
// TODO(zengjinle): which chunk_size is better? // TODO(zengjinle): which chunk_size is better?
template <typename InT, typename OutT, bool NeedSqrt = false, template <typename InT, typename OutT, int MaxTensorNumPerLaunch = 160,
int MaxTensorNumPerLaunch = 50, int MaxChunkNumPerLaunch = 680, int MaxChunkNumPerLaunch = 780>
int BlockDim = 512>
static void MultiTensorL2Norm(const platform::CUDAPlace &place, static void MultiTensorL2Norm(const platform::CUDAPlace &place,
gpuStream_t stream, const InT *x, gpuStream_t stream, const InT *x,
const int *offsets, int n, OutT *y, const int *offsets, int n, OutT *y,
...@@ -158,7 +154,7 @@ static void MultiTensorL2Norm(const platform::CUDAPlace &place, ...@@ -158,7 +154,7 @@ static void MultiTensorL2Norm(const platform::CUDAPlace &place,
constexpr int kNumTensor = MaxTensorNumPerLaunch; constexpr int kNumTensor = MaxTensorNumPerLaunch;
constexpr int kNumChunk = MaxChunkNumPerLaunch; constexpr int kNumChunk = MaxChunkNumPerLaunch;
constexpr int kBlockDim = BlockDim; constexpr int kBlockDim = 512;
int max_chunk_num = -1; int max_chunk_num = -1;
int vec_size = 8; int vec_size = 8;
...@@ -181,22 +177,22 @@ static void MultiTensorL2Norm(const platform::CUDAPlace &place, ...@@ -181,22 +177,22 @@ static void MultiTensorL2Norm(const platform::CUDAPlace &place,
auto *tmp_out_ptr = tmp_out.Alloc<MT>(n * max_chunk_num); auto *tmp_out_ptr = tmp_out.Alloc<MT>(n * max_chunk_num);
FillZeroWithPtr(tmp_out_ptr, n * max_chunk_num, stream); FillZeroWithPtr(tmp_out_ptr, n * max_chunk_num, stream);
#define PD_LAUNCH_MULTI_TENSOR_APPLY_KERNEL \ #define PD_LAUNCH_MULTI_TENSOR_APPLY_L2_NORM_KERNEL \
do { \ do { \
using FunctorT = L2NormFunctor<InT, kBlockDim, kVecSize>; \ using FunctorT = L2NormFunctor<InT, kBlockDim, kVecSize>; \
VLOG(10) << __func__ << " " << typeid(InT).name() \ VLOG(10) << __func__ << " " << typeid(InT).name() \
<< " VecSize = " << kVecSize; \ << " VecSize = " << kVecSize; \
MultiTensorApply<FunctorT, kBlockDim, kNumTensor, kNumChunk>( \ MultiTensorApply<FunctorT, kNumTensor, kNumChunk>( \
FunctorT(), stream, offsets, n, chunk_size, x, tmp_out_ptr, \ FunctorT(), stream, offsets, n, chunk_size, kBlockDim, x, tmp_out_ptr, \
max_chunk_num); \ max_chunk_num); \
} while (0) } while (0)
PD_VEC_MULTI_TENSOR_APPLY(vec_size, PD_LAUNCH_MULTI_TENSOR_APPLY_KERNEL); PD_VEC_LAUNCH_KERNEL(vec_size, PD_LAUNCH_MULTI_TENSOR_APPLY_L2_NORM_KERNEL);
#undef PD_LAUNCH_MULTI_TENSOR_APPLY_KERNEL #undef PD_LAUNCH_MULTI_TENSOR_APPLY_L2_NORM_KERNEL
MultiTensorL2NormReduceAgainCUDAKernel<MT, OutT, kBlockDim, MultiTensorL2NormReduceAgainCUDAKernel<
NeedSqrt><<<n, kBlockDim, 0, stream>>>( MT, OutT, kBlockDim><<<n, kBlockDim, 0, stream>>>(tmp_out_ptr, y,
tmp_out_ptr, y, max_chunk_num); max_chunk_num);
} }
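// Example of the two-pass reduction above (illustrative): with n = 2 tensors
// split into max_chunk_num = 3 chunks each, the first launch writes 2 * 3
// per-chunk partial sums of squares into tmp_out_ptr, and
// MultiTensorL2NormReduceAgainCUDAKernel then folds each row of 3 partial
// sums into one value, so y[0] and y[1] hold the squared L2 norm of each
// tensor (no sqrt is taken here).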
template <int LogLevel> template <int LogLevel>
...@@ -208,34 +204,17 @@ static void LogParamAndTrustRatioDivSquareNorm( ...@@ -208,34 +204,17 @@ static void LogParamAndTrustRatioDivSquareNorm(
auto tensors = ctx.MultiInput<framework::Tensor>("Param"); auto tensors = ctx.MultiInput<framework::Tensor>("Param");
if (tensors.empty()) return; if (tensors.empty()) return;
const auto *order = ctx.Input<framework::Tensor>("ParamOrder")->data<int>();
size_t n = tensors.size(); size_t n = tensors.size();
auto place = tensors[0]->place(); auto place = tensors[0]->place();
auto pn_vec = ToVector(param_square_norm, n, place); auto pn_vec = ToVector(param_square_norm, n, place);
auto tn_vec = ToVector(trust_ratio_div_square_norm, n, place); auto tn_vec = ToVector(trust_ratio_div_square_norm, n, place);
std::vector<size_t> fp32_indices, fp16_indices;
fp32_indices.reserve(n);
fp16_indices.reserve(n);
for (size_t i = 0; i < n; ++i) {
const auto *t = tensors[i];
if (t->dtype() == phi::DataType::FLOAT32) {
fp32_indices.push_back(i);
} else if (t->dtype() == phi::DataType::FLOAT16) {
fp16_indices.push_back(i);
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Unsupported data type %s.", t->dtype()));
}
}
for (auto idx : fp16_indices) {
fp32_indices.push_back(idx);
}
const auto &names = ctx.GetOp().Inputs("Param"); const auto &names = ctx.GetOp().Inputs("Param");
for (size_t i = 0; i < fp32_indices.size(); ++i) { for (size_t i = 0; i < n; ++i) {
auto idx = fp32_indices[i]; auto idx = order[i];
VLOG(LogLevel) << "Param " << tensors[idx]->dtype() << " " << names[idx] VLOG(LogLevel) << "Param " << tensors[idx]->dtype() << " " << names[idx]
<< " pn = " << pn_vec[i] << " , tn = " << tn_vec[i]; << " pn = " << pn_vec[i] << " , tn = " << tn_vec[i];
} }
...@@ -353,7 +332,7 @@ static __global__ void CalcGradNormClipBeforeAllReduceScale( ...@@ -353,7 +332,7 @@ static __global__ void CalcGradNormClipBeforeAllReduceScale(
const T1 *__restrict__ global_scale, T1 max_global_grad_norm, const T1 *__restrict__ global_scale, T1 max_global_grad_norm,
const T1 *__restrict__ square_grad_norm, T1 *__restrict__ out1, const T1 *__restrict__ square_grad_norm, T1 *__restrict__ out1,
T2 *__restrict__ out2, T1 clip_rescale_grad) { T2 *__restrict__ out2, T1 clip_rescale_grad) {
T1 grad_norm = static_cast<T1>(sqrt(*square_grad_norm)) * clip_rescale_grad; T1 grad_norm = static_cast<T1>(sqrtf(*square_grad_norm)) * clip_rescale_grad;
T1 scale = global_scale[0] * max_global_grad_norm / (1e-6 + grad_norm); T1 scale = global_scale[0] * max_global_grad_norm / (1e-6 + grad_norm);
bool found_nan_inf = !isfinite(scale); bool found_nan_inf = !isfinite(scale);
if (scale >= 1 || found_nan_inf) { if (scale >= 1 || found_nan_inf) {
...@@ -380,19 +359,24 @@ static __global__ void SetNanInfValueCUDAKernelTwoFlag(const bool *in_flag_p_1, ...@@ -380,19 +359,24 @@ static __global__ void SetNanInfValueCUDAKernelTwoFlag(const bool *in_flag_p_1,
((*in_flag_p_1) || (*in_flag_p_2)) ? __int_as_float(0x7fffffffU) : 0.0f; ((*in_flag_p_1) || (*in_flag_p_2)) ? __int_as_float(0x7fffffffU) : 0.0f;
} }
// TODO(zengjinle): Vectorize this function template <typename T, typename GradT, int VecSize>
// NOTE: this method does not update Beta1Pow and Beta2Pow! static __global__ void UpdateLambMomentAndTrustRatioDivCUDAKernel(
template <typename T, typename GradT, typename IndexT>
static __global__ void UpdateLambMoment(
const T *__restrict__ param_p, const GradT *__restrict__ grad_p, const T *__restrict__ param_p, const GradT *__restrict__ grad_p,
const T *__restrict__ square_grad_norm_p, const T *__restrict__ square_grad_norm_p,
const T *__restrict__ global_scale, const IndexT *__restrict__ indices, const T *__restrict__ global_scale, const T *__restrict__ beta1pow_p,
const T *__restrict__ weight_decay_p, const T *__restrict__ beta1pow_p,
const T *__restrict__ beta2pow_p, T *__restrict__ mom1_p, const T *__restrict__ beta2pow_p, T *__restrict__ mom1_p,
T *__restrict__ mom2_p, T *__restrict__ trust_ratio_div_p, T beta1, T beta2, T *__restrict__ mom2_p, T *__restrict__ trust_ratio_div_p, bool *found_inf,
T epsilon, T max_global_grad_norm, int num, T rescale_grad) { T weight_decay, int weight_decay_end_numel, T beta1, T beta2, T epsilon,
T max_global_grad_norm, int num, T rescale_grad) {
T square_grad_norm = *square_grad_norm_p; T square_grad_norm = *square_grad_norm_p;
if (!isfinite(square_grad_norm)) return; bool need_update_found_inf =
(found_inf && threadIdx.x == 0 && blockIdx.x == 0);
if (!isfinite(square_grad_norm)) {
if (need_update_found_inf) *found_inf = true;
return;
} else if (need_update_found_inf) {
*found_inf = false;
}
T scale = rescale_grad / global_scale[0]; T scale = rescale_grad / global_scale[0];
if (max_global_grad_norm > 0) { if (max_global_grad_norm > 0) {
...@@ -406,27 +390,112 @@ static __global__ void UpdateLambMoment( ...@@ -406,27 +390,112 @@ static __global__ void UpdateLambMoment(
T one_minus_beta1pow = 1 - beta1pow_p[0]; T one_minus_beta1pow = 1 - beta1pow_p[0];
T one_minus_beta2pow = 1 - beta2pow_p[0]; T one_minus_beta2pow = 1 - beta2pow_p[0];
CUDA_KERNEL_LOOP(i, num) { int i = (threadIdx.x + blockIdx.x * blockDim.x) * VecSize;
T p = param_p[i]; int stride = blockDim.x * gridDim.x * VecSize;
T g = static_cast<T>(grad_p[i]) * scale;
T weight_decay = weight_decay_p[i]; for (; i + VecSize <= num; i += stride) {
T mom1 = mom1_p[i]; platform::AlignedVector<T, VecSize> param_vec;
T mom2 = mom2_p[i]; platform::AlignedVector<GradT, VecSize> grad_vec;
platform::AlignedVector<T, VecSize> weight_decay_vec;
mom1 = beta1 * mom1 + (1 - beta1) * g; platform::AlignedVector<T, VecSize> mom1_vec;
mom2 = beta2 * mom2 + (1 - beta2) * g * g; platform::AlignedVector<T, VecSize> mom2_vec;
platform::AlignedVector<T, VecSize> trust_ratio_div_vec;
T mom1_unbiased = mom1 / one_minus_beta1pow;
T mom2_unbiased = mom2 / one_minus_beta2pow; T cur_weight_decay = (i < weight_decay_end_numel) * weight_decay;
T trust_ratio_div = if (cur_weight_decay != static_cast<T>(0.0)) {
mom1_unbiased / (sqrtf(mom2_unbiased) + epsilon) + weight_decay * p; platform::Load(param_p + i, &param_vec);
} else {
mom1_p[i] = mom1; #pragma unroll
mom2_p[i] = mom2; for (int j = 0; j < VecSize; ++j) {
trust_ratio_div_p[i] = trust_ratio_div; param_vec[j] = static_cast<T>(0);
}
}
platform::Load(grad_p + i, &grad_vec);
platform::Load(mom1_p + i, &mom1_vec);
platform::Load(mom2_p + i, &mom2_vec);
#define PD_LAMB_MOM_TRUST_RATIO_DIV_UPDATE(__param, __grad, __mom1, __mom2, \
__trust_ratio_div, __idx) \
T p = __param[__idx]; \
T g = static_cast<T>(__grad[__idx]) * scale; \
T mom1 = __mom1[__idx]; \
T mom2 = __mom2[__idx]; \
mom1 = beta1 * mom1 + (1 - beta1) * g; \
mom2 = beta2 * mom2 + (1 - beta2) * g * g; \
T mom1_unbiased = mom1 / one_minus_beta1pow; \
T mom2_unbiased = mom2 / one_minus_beta2pow; \
__trust_ratio_div[__idx] = \
mom1_unbiased / (sqrtf(mom2_unbiased) + epsilon) + cur_weight_decay * p; \
__mom1[__idx] = mom1; \
__mom2[__idx] = mom2;
#pragma unroll
for (int j = 0; j < VecSize; ++j) {
PD_LAMB_MOM_TRUST_RATIO_DIV_UPDATE(param_vec, grad_vec, mom1_vec,
mom2_vec, trust_ratio_div_vec, j);
}
platform::Store(mom1_vec, mom1_p + i);
platform::Store(mom2_vec, mom2_p + i);
platform::Store(trust_ratio_div_vec, trust_ratio_div_p + i);
}
for (; i < num; ++i) {
T cur_weight_decay = (i < weight_decay_end_numel) * weight_decay;
PD_LAMB_MOM_TRUST_RATIO_DIV_UPDATE(param_p, grad_p, mom1_p, mom2_p,
trust_ratio_div_p, i);
} }
} }
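// The per-element update implemented above is the usual LAMB moment step
// (p = param, g = scaled grad, wd = cur_weight_decay):
//   mom1 <- beta1 * mom1 + (1 - beta1) * g
//   mom2 <- beta2 * mom2 + (1 - beta2) * g * g
//   trust_ratio_div <- (mom1 / (1 - beta1pow)) /
//                      (sqrt(mom2 / (1 - beta2pow)) + epsilon) + wd * p
// Worked example: beta1 = 0.9, beta2 = 0.999, mom1 = mom2 = 0, g = 1,
// beta1pow = 0.9, beta2pow = 0.999, epsilon = 0, wd = 0 gives mom1 = 0.1,
// mom2 = 0.001 and trust_ratio_div = 1.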
template <typename T, typename GradT>
static void MultiTensorUpdateLambMomentAndTrustRatioDiv(
const platform::CUDADeviceContext &dev_ctx, const int *offsets, int n,
const T *param_p, const GradT *grad_p, const T *square_grad_norm_p,
const T *global_scale, const T *beta1pow_p, const T *beta2pow_p, T *mom1_p,
T *mom2_p, T *trust_ratio_div_p, bool *found_inf_p, T weight_decay,
int weight_decay_end_idx, T beta1, T beta2, T epsilon,
T max_global_grad_norm, T rescale_grad) {
if (n <= 0) return;
int numel = offsets[n] - offsets[0];
PADDLE_ENFORCE_GE(weight_decay_end_idx, 0,
platform::errors::InvalidArgument(
"The weight decay end index should be >= 0."));
PADDLE_ENFORCE_LE(weight_decay_end_idx, n,
platform::errors::InvalidArgument(
"The weight decay end index should be < %d.", n));
auto weight_decay_end_numel = offsets[weight_decay_end_idx] - offsets[0];
int vec_size = GetChunkedVecSize(param_p, 0);
vec_size = std::min(vec_size, GetChunkedVecSize(grad_p, 0));
vec_size = std::min(vec_size, GetChunkedVecSize(mom1_p, 0));
vec_size = std::min(vec_size, GetChunkedVecSize(mom2_p, 0));
vec_size = std::min(vec_size, GetChunkedVecSize(trust_ratio_div_p, 0));
for (int i = 0; i < n; ++i) {
auto length = offsets[i + 1] - offsets[i];
while (length % vec_size != 0) {
vec_size /= 2;
}
}
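// Example: assuming pointer alignment allows an initial vec_size of 8, with
// per-tensor lengths {1024, 768, 522} the loop above halves vec_size to 4
// and then to 2 (522 % 8 != 0 and 522 % 4 != 0), so no vectorized access
// straddles a tensor boundary.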
VLOG(1) << __func__ << " VecSize = " << vec_size;
auto stream = dev_ctx.stream();
auto config = platform::GetGpuLaunchConfig1D(dev_ctx, numel, vec_size);
#define PD_LAUNCH_LAMB_MOM_TRUST_RATIO_DIV_KERNEL \
do { \
UpdateLambMomentAndTrustRatioDivCUDAKernel<T, GradT, kVecSize><<< \
config.block_per_grid, config.thread_per_block, 0, stream>>>( \
param_p, grad_p, square_grad_norm_p, global_scale, beta1pow_p, \
beta2pow_p, mom1_p, mom2_p, trust_ratio_div_p, found_inf_p, \
weight_decay, weight_decay_end_numel, beta1, beta2, epsilon, \
max_global_grad_norm, numel, rescale_grad); \
} while (0)
PD_VEC_LAUNCH_KERNEL(vec_size, PD_LAUNCH_LAMB_MOM_TRUST_RATIO_DIV_KERNEL);
#undef PD_LAUNCH_LAMB_MOM_TRUST_RATIO_DIV_KERNEL
}
template <typename T, bool NeedUpdate /*=true*/> template <typename T, bool NeedUpdate /*=true*/>
struct LambBetaPowUpdateOnceHelper { struct LambBetaPowUpdateOnceHelper {
LambBetaPowUpdateOnceHelper(T *beta1pow, T *beta2pow, T beta1, T beta2) { LambBetaPowUpdateOnceHelper(T *beta1pow, T *beta2pow, T beta1, T beta2) {
...@@ -468,33 +537,6 @@ struct LambBetaPowUpdateOnceHelper<T, false> { ...@@ -468,33 +537,6 @@ struct LambBetaPowUpdateOnceHelper<T, false> {
HOSTDEVICE void UpdateBetaPows() const {} HOSTDEVICE void UpdateBetaPows() const {}
}; };
template <bool HasFoundInf /*=true*/>
struct LambFoundInfHelper {
public:
explicit LambFoundInfHelper(bool *found_inf) : found_inf_(found_inf) {
PADDLE_ENFORCE_NOT_NULL(found_inf,
platform::errors::InvalidArgument(
"The found_inf should not be nullptr."));
}
HOSTDEVICE void UpdateFoundInf(bool value) { *found_inf_ = value; }
private:
bool *__restrict__ found_inf_;
};
template <>
struct LambFoundInfHelper<false> {
public:
explicit LambFoundInfHelper(bool *found_inf) {
PADDLE_ENFORCE_EQ(
found_inf, nullptr,
platform::errors::InvalidArgument("The found_inf should be nullptr."));
}
HOSTDEVICE void UpdateFoundInf(bool) {}
};
template <typename T, bool HasMasterParam /*=true*/> template <typename T, bool HasMasterParam /*=true*/>
struct LambParamHelper { struct LambParamHelper {
LambParamHelper(T *param, MasterT<T> *master_param) { LambParamHelper(T *param, MasterT<T> *master_param) {
...@@ -509,12 +551,9 @@ struct LambParamHelper { ...@@ -509,12 +551,9 @@ struct LambParamHelper {
master_param_ = master_param; master_param_ = master_param;
} }
HOSTDEVICE void SetParam(int i, MasterT<T> updated_p) { HOSTDEVICE T *__restrict__ ParamPtr() { return param_; }
param_[i] = static_cast<T>(updated_p);
master_param_[i] = updated_p;
}
HOSTDEVICE MasterT<T> GetParam(int i) { return master_param_[i]; } HOSTDEVICE MasterT<T> *__restrict__ MasterParamPtr() { return master_param_; }
private: private:
T *__restrict__ param_; T *__restrict__ param_;
...@@ -538,158 +577,169 @@ struct LambParamHelper<T, false> { ...@@ -538,158 +577,169 @@ struct LambParamHelper<T, false> {
param_ = param; param_ = param;
} }
HOSTDEVICE void SetParam(int i, MasterT<T> updated_p) { HOSTDEVICE T *__restrict__ ParamPtr() { return param_; }
param_[i] = static_cast<T>(updated_p);
}
HOSTDEVICE MasterT<T> GetParam(int i) { HOSTDEVICE constexpr MasterT<T> *MasterParamPtr() { return nullptr; }
return static_cast<MasterT<T>>(param_[i]);
}
private: private:
T *__restrict__ param_; T *__restrict__ param_;
}; };
template <typename ParamT, typename IndexT, bool HasMasterParam, template <typename ParamT, bool HasMasterParam, bool NeedUpdateBetaPow,
bool NeedUpdateBetaPow, bool HasFoundInf> int VecSize>
struct LambParamAndBetaPowsUpdateHelper struct LambUpdateParamAndBetaPowsFunctor {
: public LambParamHelper<ParamT, HasMasterParam>, DEVICE void operator()(
public LambBetaPowUpdateOnceHelper<MasterT<ParamT>, NeedUpdateBetaPow>, int tensor_id, int chunk_id, int offset, int size,
public LambFoundInfHelper<HasFoundInf> { LambParamHelper<ParamT, HasMasterParam> param_helper,
LambParamAndBetaPowsUpdateHelper( const MasterT<ParamT> *trust_ratio_div, const MasterT<ParamT> *lr,
ParamT *param, MasterT<ParamT> *master_param, MasterT<ParamT> *beta1pow,
MasterT<ParamT> *beta2pow, MasterT<ParamT> beta1, MasterT<ParamT> beta2,
bool *found_inf, const MasterT<ParamT> *trust_ratio_div,
const MasterT<ParamT> *lr, const IndexT *index,
const MasterT<ParamT> *param_square_norm, const MasterT<ParamT> *param_square_norm,
const MasterT<ParamT> *trust_ratio_div_square_norm, const MasterT<ParamT> *trust_ratio_div_square_norm, const bool *found_inf,
const MasterT<ParamT> *update_flag) LambBetaPowUpdateOnceHelper<MasterT<ParamT>, NeedUpdateBetaPow>
: LambParamHelper<ParamT, HasMasterParam>(param, master_param), betapow_helper) const {
LambBetaPowUpdateOnceHelper<MasterT<ParamT>, NeedUpdateBetaPow>( if (*found_inf) return;
beta1pow, beta2pow, beta1, beta2),
LambFoundInfHelper<HasFoundInf>(found_inf), using MT = MasterT<ParamT>;
trust_ratio_div(trust_ratio_div),
lr(lr),
index(index),
param_square_norm(param_square_norm),
trust_ratio_div_square_norm(trust_ratio_div_square_norm),
update_flag(update_flag) {}
const MasterT<ParamT> *__restrict__ trust_ratio_div;
const MasterT<ParamT> *__restrict__ lr;
const IndexT *__restrict__ index;
const MasterT<ParamT> *__restrict__ param_square_norm;
const MasterT<ParamT> *__restrict__ trust_ratio_div_square_norm;
const MasterT<ParamT> *__restrict__ update_flag;
};
template <typename ParamT, typename IndexT, bool HasMasterParam, MT p_square_norm = param_square_norm[tensor_id];
bool NeedUpdateBetaPow, bool HasFoundInf> MT t_square_norm = trust_ratio_div_square_norm[tensor_id];
static __global__ void LambUpdateParamAndBetaPowsCUDAKernel( MT lr_value = *lr;
LambParamAndBetaPowsUpdateHelper<ParamT, IndexT, HasMasterParam, MT ratio = (p_square_norm != static_cast<MT>(0) &&
NeedUpdateBetaPow, HasFoundInf> t_square_norm != static_cast<MT>(0)
args, ? lr_value * sqrtf(p_square_norm / t_square_norm)
int num) { : lr_value);
auto should_update = *args.update_flag;
if (!isfinite(should_update)) { int i;
if (HasFoundInf && threadIdx.x == 0 && blockIdx.x == 0) { int stride = blockDim.x * VecSize;
args.UpdateFoundInf(true);
ParamT *param = param_helper.ParamPtr() + offset;
MT *master_param = HasMasterParam ? param_helper.MasterParamPtr() + offset
: param_helper.MasterParamPtr();
trust_ratio_div += offset;
for (i = threadIdx.x * VecSize; i + VecSize <= size; i += stride) {
platform::AlignedVector<MT, VecSize> trust_ratio_div_vec;
platform::Load(trust_ratio_div + i, &trust_ratio_div_vec);
if (HasMasterParam) {
platform::AlignedVector<MT, VecSize> master_param_vec;
platform::Load(master_param + i, &master_param_vec);
platform::AlignedVector<ParamT, VecSize> param_vec;
#pragma unroll
for (int j = 0; j < VecSize; ++j) {
MT p = master_param_vec[j] - ratio * trust_ratio_div_vec[j];
master_param_vec[j] = p;
param_vec[j] = static_cast<ParamT>(p);
}
platform::Store(master_param_vec, master_param + i);
platform::Store(param_vec, param + i);
} else {
platform::AlignedVector<ParamT, VecSize> param_vec;
platform::Load(param + i, &param_vec);
#pragma unroll
for (int j = 0; j < VecSize; ++j) {
MT p = static_cast<MT>(param_vec[j]) - ratio * trust_ratio_div_vec[j];
param_vec[j] = static_cast<ParamT>(p);
}
platform::Store(param_vec, param + i);
}
}
for (; i < size; ++i) {
if (HasMasterParam) {
MT p = master_param[i] - ratio * trust_ratio_div[i];
master_param[i] = p;
param[i] = static_cast<ParamT>(p);
} else {
MT p = static_cast<MT>(param[i]) - ratio * trust_ratio_div[i];
param[i] = static_cast<ParamT>(p);
}
}
if (NeedUpdateBetaPow && threadIdx.x == 0 && blockIdx.x == 0) {
betapow_helper.UpdateBetaPows();
} }
return;
} else if (HasFoundInf && threadIdx.x == 0 && blockIdx.x == 0) {
args.UpdateFoundInf(false);
} }
};
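// In the functor above, each parameter element is updated as
//   ratio = lr * sqrt(||p||^2 / ||trust_ratio_div||^2)
//           (or just lr if either squared norm is zero)
//   p <- p - ratio * trust_ratio_div
// e.g. with lr = 0.01, ||p||^2 = 4 and ||trust_ratio_div||^2 = 1, the
// effective step size is 0.01 * 2 = 0.02 per trust_ratio_div element.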
if (NeedUpdateBetaPow && threadIdx.x == 0 && blockIdx.x == 0) { // TODO(zengjinle): which block_dim and chunk_size would be better?
args.UpdateBetaPows(); template <typename ParamT, int MaxTensorNumPerLaunch = 160,
int MaxChunkNumPerLaunch = 780>
static void MultiTensorUpdateLambParamAndBetaPows(
const platform::CUDADeviceContext &dev_ctx, const int *offsets, int n,
const MasterT<ParamT> *trust_ratio_div, const MasterT<ParamT> *lr,
const MasterT<ParamT> *param_square_norm,
const MasterT<ParamT> *trust_ratio_div_square_norm, const bool *found_inf,
ParamT *param, MasterT<ParamT> *master_param, MasterT<ParamT> *beta1pow,
MasterT<ParamT> *beta2pow, MasterT<ParamT> beta1, MasterT<ParamT> beta2,
int chunk_size = 65536) {
constexpr bool kHasMasterParam =
!(std::is_same<ParamT, MasterT<ParamT>>::value);
bool has_beta_pow = (beta1pow != nullptr);
if (has_beta_pow) {
PADDLE_ENFORCE_NOT_NULL(beta2pow, platform::errors::InvalidArgument(
"Beta2Pow should not be nullptr."));
} else {
PADDLE_ENFORCE_EQ(beta2pow, nullptr, platform::errors::InvalidArgument(
"Beta2Pow should be nullptr."));
} }
using MT = MasterT<ParamT>; const int block_dim = 512;
MT lr_value = *args.lr; int vec_size = 8;
CUDA_KERNEL_LOOP(i, num) { for (int i = 0; i < n; ++i) {
MT p = args.GetParam(i); int offset = offsets[i] - offsets[0];
MT t = args.trust_ratio_div[i]; vec_size =
auto norm_idx = args.index[i]; std::min(vec_size, GetChunkedVecSize(param + offset, chunk_size));
MT p_square_norm = args.param_square_norm[norm_idx]; if (kHasMasterParam) {
MT t_square_norm = args.trust_ratio_div_square_norm[norm_idx]; vec_size = std::min(vec_size,
GetChunkedVecSize(master_param + offset, chunk_size));
}
vec_size = std::min(
vec_size, GetChunkedVecSize(trust_ratio_div + offset, chunk_size));
}
MT p_norm = static_cast<MT>(sqrtf(p_square_norm)); VLOG(1) << __func__ << " VecSize = " << vec_size;
MT t_norm = static_cast<MT>(sqrtf(t_square_norm));
auto update = (p_norm != static_cast<MT>(0) && t_norm != static_cast<MT>(0)) constexpr auto kNumTensor = MaxTensorNumPerLaunch;
? p_norm / t_norm constexpr auto kNumChunk = MaxChunkNumPerLaunch;
: static_cast<MT>(1);
MT updated_p = p - lr_value * update * t; auto stream = dev_ctx.stream();
args.SetParam(i, updated_p); #define PD_LAUNCH_MULTI_TENSOR_UPDATE_PARAM_BETAPOW(__has_beta_pow) \
} do { \
} using FunctorT = \
LambUpdateParamAndBetaPowsFunctor<ParamT, kHasMasterParam, \
__has_beta_pow, kVecSize>; \
LambParamHelper<ParamT, kHasMasterParam> param_helper(param, \
master_param); \
LambBetaPowUpdateOnceHelper<MasterT<ParamT>, __has_beta_pow> \
betapow_helper(beta1pow, beta2pow, beta1, beta2); \
launcher.Launch(FunctorT(), param_helper, trust_ratio_div, lr, \
param_square_norm, trust_ratio_div_square_norm, found_inf, \
betapow_helper); \
} while (0)
template <typename ParamT, typename IndexT> #define PD_LAUNCH_VEC_MULTI_TENSOR_UPDATE_PARAM_BETAPOW_CASE \
static void LambUpdateParamAndBetaPows( do { \
const platform::CUDADeviceContext &dev_ctx, auto callback = [&]( \
const MasterT<ParamT> *trust_ratio_div, const MasterT<ParamT> *lr, const MultiTensorLauncher<kNumTensor, kNumChunk> &launcher, \
const IndexT *index, const MasterT<ParamT> *param_square_norm, int launch_n) { \
const MasterT<ParamT> *trust_ratio_div_square_norm, if (has_beta_pow && launch_n == 0) { \
const MasterT<ParamT> *update_flag, MasterT<ParamT> **beta1pow, PD_LAUNCH_MULTI_TENSOR_UPDATE_PARAM_BETAPOW(true); \
MasterT<ParamT> **beta2pow, bool **found_inf, MasterT<ParamT> beta1, beta1pow = nullptr; \
MasterT<ParamT> beta2, int num, ParamT *param, beta2pow = nullptr; \
MasterT<ParamT> *master_param, gpuStream_t stream) { } else { \
if (num == 0) return; PD_LAUNCH_MULTI_TENSOR_UPDATE_PARAM_BETAPOW(false); \
} \
bool has_master_param = !(std::is_same<ParamT, MasterT<ParamT>>::value); }; \
auto has_beta_pow = (*beta1pow) != nullptr && (*beta2pow) != nullptr; MultiTensorApplyWithCallback<kNumTensor, kNumChunk>( \
auto has_found_inf = (*found_inf) != nullptr; stream, offsets, n, chunk_size, block_dim, callback); \
#define PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL( \
__has_master_param, __has_beta_pow, __has_found_inf) \
do { \
LambParamAndBetaPowsUpdateHelper<ParamT, IndexT, __has_master_param, \
__has_beta_pow, __has_found_inf> \
helper(param, master_param, *beta1pow, *beta2pow, beta1, beta2, \
*found_inf, trust_ratio_div, lr, index, param_square_norm, \
trust_ratio_div_square_norm, update_flag); \
auto config = platform::GetGpuLaunchConfig1D(dev_ctx, num); \
LambUpdateParamAndBetaPowsCUDAKernel<<< \
config.block_per_grid, config.thread_per_block, 0, stream>>>(helper, \
num); \
} while (0) } while (0)
if (has_master_param) { PD_VEC_LAUNCH_KERNEL(vec_size,
if (has_beta_pow) { PD_LAUNCH_VEC_MULTI_TENSOR_UPDATE_PARAM_BETAPOW_CASE);
if (has_found_inf) {
PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL(true, true, true);
} else {
PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL(true, true, false);
}
} else {
if (has_found_inf) {
PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL(true, false, true);
} else {
PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL(true, false, false);
}
}
} else {
if (has_beta_pow) {
if (has_found_inf) {
PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL(false, true, true);
} else {
PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL(false, true, false);
}
} else {
if (has_found_inf) {
PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL(false, false, true);
} else {
PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL(false, false, false);
}
}
}
*beta1pow = nullptr; #undef PD_LAUNCH_MULTI_TENSOR_UPDATE_PARAM_BETAPOW
*beta2pow = nullptr; #undef PD_LAUNCH_VEC_MULTI_TENSOR_UPDATE_PARAM_BETAPOW_CASE
*found_inf = nullptr;
#undef PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL
} }
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
...@@ -1005,15 +1055,16 @@ class DistributedFusedLambOpKernel<platform::CUDADeviceContext, T> ...@@ -1005,15 +1055,16 @@ class DistributedFusedLambOpKernel<platform::CUDADeviceContext, T>
"Too many parameter number. Only <= %d is supported.", "Too many parameter number. Only <= %d is supported.",
std::numeric_limits<int>::max())); std::numeric_limits<int>::max()));
// Step 3: Get FusedIndices, ParamInfo // Step 3: Get ParamInfo
const auto *indices = GetInputTensorPtr<int>(ctx, "FusedIndices");
const auto *param_info_tensor = GetInputTensorPtr<int>(ctx, "ParamInfo"); const auto *param_info_tensor = GetInputTensorPtr<int>(ctx, "ParamInfo");
auto fp32_local_start_idx = param_info_tensor[0]; auto fp32_local_start_idx = param_info_tensor[0];
auto fp32_local_param_num = param_info_tensor[1]; auto fp32_local_param_num = param_info_tensor[1];
auto fp32_global_param_num = param_info_tensor[2]; auto fp32_global_param_num = param_info_tensor[2];
auto fp16_local_start_idx = param_info_tensor[3]; auto fp32_weight_decay_end_idx = param_info_tensor[3];
auto fp16_local_param_num = param_info_tensor[4]; auto fp16_local_start_idx = param_info_tensor[4];
auto fp16_global_param_num = param_info_tensor[5]; auto fp16_local_param_num = param_info_tensor[5];
auto fp16_global_param_num = param_info_tensor[6];
auto fp16_weight_decay_end_idx = param_info_tensor[7];
auto local_param_num = fp32_local_param_num + fp16_local_param_num; auto local_param_num = fp32_local_param_num + fp16_local_param_num;
auto param_num = fp32_global_param_num + fp16_global_param_num; auto param_num = fp32_global_param_num + fp16_global_param_num;
...@@ -1031,7 +1082,7 @@ class DistributedFusedLambOpKernel<platform::CUDADeviceContext, T> ...@@ -1031,7 +1082,7 @@ class DistributedFusedLambOpKernel<platform::CUDADeviceContext, T>
<< " , fp16_global_param_num = " << fp16_global_param_num; << " , fp16_global_param_num = " << fp16_global_param_num;
// Step 4: Get LearningRate, Moment1, Moment2, Beta1Pow, Beta2Pow, // Step 4: Get LearningRate, Moment1, Moment2, Beta1Pow, Beta2Pow,
// WeightDecay, GlobalScale, FoundInf // GlobalScale, FoundInf
const auto *global_scale = GetInputTensorPtr<float>(ctx, "GlobalScale"); const auto *global_scale = GetInputTensorPtr<float>(ctx, "GlobalScale");
const auto *lr = GetInputTensorPtr<float>(ctx, "LearningRate"); const auto *lr = GetInputTensorPtr<float>(ctx, "LearningRate");
int64_t partial_numel = 0; int64_t partial_numel = 0;
...@@ -1065,14 +1116,15 @@ class DistributedFusedLambOpKernel<platform::CUDADeviceContext, T> ...@@ -1065,14 +1116,15 @@ class DistributedFusedLambOpKernel<platform::CUDADeviceContext, T>
GetSameInOutTensorPtr<float>(ctx, place, "Beta1Pow", "Beta1PowOut"); GetSameInOutTensorPtr<float>(ctx, place, "Beta1Pow", "Beta1PowOut");
auto *beta2pow = auto *beta2pow =
GetSameInOutTensorPtr<float>(ctx, place, "Beta2Pow", "Beta2PowOut"); GetSameInOutTensorPtr<float>(ctx, place, "Beta2Pow", "Beta2PowOut");
const float *weight_decay = GetInputTensorPtr<float>(ctx, "WeightDecay");
auto *found_inf_t = ctx.Output<framework::Tensor>("FoundInf"); auto *found_inf_t = ctx.Output<framework::Tensor>("FoundInf");
found_inf_t->Resize({1}); found_inf_t->Resize({1});
auto *found_inf = found_inf_t->mutable_data<bool>(place); auto *found_inf = found_inf_t->mutable_data<bool>(place);
// Step 5: Get attributes beta1, beta2, epsilon, max_grad_norm, ring_id, // Step 5: Get attributes weight_decay, beta1, beta2, epsilon,
// max_grad_norm, ring_id,
// use_master_param_norm, is_grad_scaled_by_nranks // use_master_param_norm, is_grad_scaled_by_nranks
auto weight_decay = ctx.Attr<float>("weight_decay");
auto beta1 = ctx.Attr<float>("beta1"); auto beta1 = ctx.Attr<float>("beta1");
auto beta2 = ctx.Attr<float>("beta2"); auto beta2 = ctx.Attr<float>("beta2");
auto epsilon = ctx.Attr<float>("epsilon"); auto epsilon = ctx.Attr<float>("epsilon");
...@@ -1105,7 +1157,8 @@ class DistributedFusedLambOpKernel<platform::CUDADeviceContext, T> ...@@ -1105,7 +1157,8 @@ class DistributedFusedLambOpKernel<platform::CUDADeviceContext, T>
platform::float16 *fp16_sum_grad; platform::float16 *fp16_sum_grad;
auto fp32_numel_each_device = fp32_numel / num_devices; auto fp32_numel_each_device = fp32_numel / num_devices;
auto fp16_numel_each_device = fp16_numel / num_devices; auto fp16_numel_each_device = fp16_numel / num_devices;
if (num_devices > 1) { if (num_devices > 1 ||
(max_global_grad_norm > 0 && !clip_after_allreduce)) {
auto ptr = sum_grad_buffer.Alloc<uint8_t>( auto ptr = sum_grad_buffer.Alloc<uint8_t>(
fp32_numel_each_device * sizeof(float) + fp32_numel_each_device * sizeof(float) +
fp16_numel_each_device * sizeof(platform::float16)); fp16_numel_each_device * sizeof(platform::float16));
...@@ -1181,7 +1234,11 @@ class DistributedFusedLambOpKernel<platform::CUDADeviceContext, T> ...@@ -1181,7 +1234,11 @@ class DistributedFusedLambOpKernel<platform::CUDADeviceContext, T>
float, platform::float16><<<1, 1, 0, stream>>>( float, platform::float16><<<1, 1, 0, stream>>>(
global_scale, max_global_grad_norm, fp32_square_grad_norm, global_scale, max_global_grad_norm, fp32_square_grad_norm,
fp32_scale, fp16_scale, clip_scale); fp32_scale, fp16_scale, clip_scale);
VLOG(1) << "Grad scale: " << FlattenToString(fp32_scale, 1, place); if (fp32_scale) {
VLOG(1) << "Grad scale: " << FlattenToString(fp32_scale, 1, place);
} else {
VLOG(1) << "Grad scale: " << FlattenToString(fp16_scale, 1, place);
}
if (num_devices > 1) { if (num_devices > 1) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(
fp32_square_grad_norm, fp32_square_grad_norm, 1, ncclFloat32, fp32_square_grad_norm, fp32_square_grad_norm, 1, ncclFloat32,
...@@ -1218,36 +1275,56 @@ class DistributedFusedLambOpKernel<platform::CUDADeviceContext, T> ...@@ -1218,36 +1275,56 @@ class DistributedFusedLambOpKernel<platform::CUDADeviceContext, T>
VLOG(10) << "ReduceScatter done"; VLOG(10) << "ReduceScatter done";
// Step 7: update the moment1, moment2. Calculate the trust_ratio_div // Step 7: update the moment1, moment2. Calculate the trust_ratio_div
auto *fused_offsets_t = ctx.Input<framework::Tensor>("FusedParamOffsets");
auto *fused_offsets = fused_offsets_t->data<int>();
auto *fp32_partial_fused_offsets_t =
ctx.Input<framework::Tensor>("FP32ShardFusedParamOffsets");
const auto *fp32_partial_fused_offsets =
fp32_partial_fused_offsets_t->data<int>();
auto *fp16_partial_fused_offsets_t =
ctx.Input<framework::Tensor>("FP16ShardFusedParamOffsets");
const auto *fp16_partial_fused_offsets =
fp16_partial_fused_offsets_t->data<int>();
VLOG(1) << "FusedParamOffsets: "
<< FlattenToString(fused_offsets, fused_offsets_t->numel(),
fused_offsets_t->place());
VLOG(1) << "FP32ShardFusedParamOffsets: "
<< FlattenToString(fp32_partial_fused_offsets,
fp32_partial_fused_offsets_t->numel(),
fp32_partial_fused_offsets_t->place());
VLOG(1) << "FP16ShardFusedParamOffsets: "
<< FlattenToString(fp16_partial_fused_offsets,
fp16_partial_fused_offsets_t->numel(),
fp16_partial_fused_offsets_t->place());
memory::Buffer trust_ratio_div_buffer(place); memory::Buffer trust_ratio_div_buffer(place);
auto *trust_ratio_div = trust_ratio_div_buffer.Alloc<float>(partial_numel); auto *trust_ratio_div = trust_ratio_div_buffer.Alloc<float>(partial_numel);
auto fp32_offset = rank * fp32_numel_each_device; auto fp32_offset = rank * fp32_numel_each_device;
auto fp16_offset = rank * fp16_numel_each_device; auto fp16_offset = rank * fp16_numel_each_device;
if (has_fp32_param) { if (has_fp32_param) {
auto config =
platform::GetGpuLaunchConfig1D(dev_ctx, fp32_numel_each_device);
VLOG(10) << "Update FP32 Moment and TrustRatioDiv starts"; VLOG(10) << "Update FP32 Moment and TrustRatioDiv starts";
UpdateLambMoment<<<config.block_per_grid, config.thread_per_block, 0, MultiTensorUpdateLambMomentAndTrustRatioDiv(
stream>>>( dev_ctx, fp32_partial_fused_offsets, fp32_local_param_num,
fp32_param + fp32_offset, fp32_sum_grad, fp32_square_grad_norm, fp32_param + fp32_offset, fp32_sum_grad, fp32_square_grad_norm,
global_scale, indices + fp32_offset, weight_decay, beta1pow, beta2pow, global_scale, beta1pow, beta2pow, moment1, moment2, trust_ratio_div,
moment1, moment2, trust_ratio_div, beta1, beta2, epsilon, found_inf, weight_decay, fp32_weight_decay_end_idx, beta1, beta2,
max_global_grad_norm, fp32_numel_each_device, rescale_grad); epsilon, max_global_grad_norm, rescale_grad);
VLOG(10) << "Update FP32 Moment and TrustRatioDiv done"; VLOG(10) << "Update FP32 Moment and TrustRatioDiv done";
} }
float *master_param = nullptr; float *master_param = nullptr;
if (has_fp16_param) { if (has_fp16_param) {
master_param = fp32_param + fp32_numel; master_param = fp32_param + fp32_numel;
auto config =
platform::GetGpuLaunchConfig1D(dev_ctx, fp16_numel_each_device);
VLOG(10) << "Update FP16 Moment and TrustRatioDiv starts"; VLOG(10) << "Update FP16 Moment and TrustRatioDiv starts";
UpdateLambMoment<<<config.block_per_grid, config.thread_per_block, 0, auto tmp_found_inf = has_fp32_param ? nullptr : found_inf;
stream>>>( MultiTensorUpdateLambMomentAndTrustRatioDiv(
dev_ctx, fp16_partial_fused_offsets, fp16_local_param_num,
master_param + fp16_offset, fp16_sum_grad, fp32_square_grad_norm, master_param + fp16_offset, fp16_sum_grad, fp32_square_grad_norm,
global_scale, indices + fp32_numel + fp16_offset, weight_decay, global_scale, beta1pow, beta2pow, moment1 + fp32_numel_each_device,
beta1pow, beta2pow, moment1 + fp32_numel_each_device,
moment2 + fp32_numel_each_device, moment2 + fp32_numel_each_device,
trust_ratio_div + fp32_numel_each_device, beta1, beta2, epsilon, trust_ratio_div + fp32_numel_each_device, tmp_found_inf, weight_decay,
max_global_grad_norm, fp16_numel_each_device, rescale_grad); fp16_weight_decay_end_idx, beta1, beta2, epsilon,
max_global_grad_norm, rescale_grad);
VLOG(10) << "Update FP16 Moment and TrustRatioDiv done"; VLOG(10) << "Update FP16 Moment and TrustRatioDiv done";
} }
...@@ -1257,30 +1334,6 @@ class DistributedFusedLambOpKernel<platform::CUDADeviceContext, T> ...@@ -1257,30 +1334,6 @@ class DistributedFusedLambOpKernel<platform::CUDADeviceContext, T>
memory::Buffer square_norm_buffer(place); memory::Buffer square_norm_buffer(place);
auto *param_square_norm = square_norm_buffer.Alloc<float>(2 * param_num); auto *param_square_norm = square_norm_buffer.Alloc<float>(2 * param_num);
auto *trust_ratio_div_square_norm = param_square_norm + param_num; auto *trust_ratio_div_square_norm = param_square_norm + param_num;
auto *fused_offsets_t = ctx.Input<framework::Tensor>("FusedParamOffsets");
auto *fused_offsets = fused_offsets_t->data<int>();
auto *fp32_partial_fused_offsets_t =
ctx.Input<framework::Tensor>("FP32ShardFusedParamOffsets");
const auto *fp32_partial_fused_offsets =
fp32_partial_fused_offsets_t->data<int>();
auto *fp16_partial_fused_offsets_t =
ctx.Input<framework::Tensor>("FP16ShardFusedParamOffsets");
const auto *fp16_partial_fused_offsets =
fp16_partial_fused_offsets_t->data<int>();
VLOG(1) << "FusedParamOffsets: "
<< FlattenToString(fused_offsets, fused_offsets_t->numel(),
fused_offsets_t->place());
VLOG(1) << "FP32ShardFusedParamOffsets: "
<< FlattenToString(fp32_partial_fused_offsets,
fp32_partial_fused_offsets_t->numel(),
fp32_partial_fused_offsets_t->place());
VLOG(1) << "FP16ShardFusedParamOffsets: "
<< FlattenToString(fp16_partial_fused_offsets,
fp16_partial_fused_offsets_t->numel(),
fp16_partial_fused_offsets_t->place());
if (num_devices > 1) { if (num_devices > 1) {
if (use_master_param_norm) { if (use_master_param_norm) {
FillZeroWithPtr(param_square_norm + fp32_global_param_num, FillZeroWithPtr(param_square_norm + fp32_global_param_num,
...@@ -1296,11 +1349,11 @@ class DistributedFusedLambOpKernel<platform::CUDADeviceContext, T> ...@@ -1296,11 +1349,11 @@ class DistributedFusedLambOpKernel<platform::CUDADeviceContext, T>
fp16_partial_fused_offsets, fp16_local_param_num, fp16_partial_fused_offsets, fp16_local_param_num,
param_square_norm + fp16_local_start_idx); param_square_norm + fp16_local_start_idx);
} else { } else {
// NOTE: extra computation is performed. We can improve this performance
// if needed in the future.
MultiTensorL2Norm( MultiTensorL2Norm(
place, stream, fp16_param, fused_offsets + fp32_global_param_num, place, stream, fp16_param + fused_offsets[fp16_local_start_idx] -
fp16_global_param_num, param_square_norm + fp32_global_param_num); fused_offsets[fp32_global_param_num],
fused_offsets + fp16_local_start_idx, fp16_local_param_num,
param_square_norm + fp16_local_start_idx);
} }
MultiTensorL2Norm(place, stream, trust_ratio_div, MultiTensorL2Norm(place, stream, trust_ratio_div,
...@@ -1333,26 +1386,29 @@ class DistributedFusedLambOpKernel<platform::CUDADeviceContext, T> ...@@ -1333,26 +1386,29 @@ class DistributedFusedLambOpKernel<platform::CUDADeviceContext, T>
// Step 9: update parameter, beta1pow, beta2pow. All gather parameters. // Step 9: update parameter, beta1pow, beta2pow. All gather parameters.
if (has_fp32_param) { if (has_fp32_param) {
LambUpdateParamAndBetaPows<float>( MultiTensorUpdateLambParamAndBetaPows<float>(
dev_ctx, trust_ratio_div, lr, indices + fp32_offset, dev_ctx, fp32_partial_fused_offsets, fp32_local_param_num,
param_square_norm, trust_ratio_div_square_norm, fp32_square_grad_norm, trust_ratio_div, lr, param_square_norm + fp32_local_start_idx,
&beta1pow, &beta2pow, &found_inf, beta1, beta2, trust_ratio_div_square_norm + fp32_local_start_idx, found_inf,
fp32_numel_each_device, fp32_param + fp32_offset, nullptr, stream); fp32_param + fp32_offset, nullptr, beta1pow, beta2pow, beta1, beta2);
if (num_devices > 1) { if (num_devices > 1) {
// ncclAllGather // ncclAllGather
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather(
fp32_param + fp32_offset, fp32_param, fp32_numel_each_device, fp32_param + fp32_offset, fp32_param, fp32_numel_each_device,
ncclFloat32, comm, stream)); ncclFloat32, comm, stream));
} }
beta1pow = nullptr;
beta2pow = nullptr;
} }
if (has_fp16_param) { if (has_fp16_param) {
LambUpdateParamAndBetaPows<platform::float16>( MultiTensorUpdateLambParamAndBetaPows<platform::float16>(
dev_ctx, trust_ratio_div + fp32_numel_each_device, lr, dev_ctx, fp16_partial_fused_offsets, fp16_local_param_num,
indices + fp32_numel + fp16_offset, param_square_norm, trust_ratio_div + fp32_numel_each_device, lr,
trust_ratio_div_square_norm, fp32_square_grad_norm, &beta1pow, param_square_norm + fp16_local_start_idx,
&beta2pow, &found_inf, beta1, beta2, fp16_numel_each_device, trust_ratio_div_square_norm + fp16_local_start_idx, found_inf,
fp16_param + fp16_offset, master_param + fp16_offset, stream); fp16_param + fp16_offset, master_param + fp16_offset, beta1pow,
beta2pow, beta1, beta2);
if (num_devices > 1) { if (num_devices > 1) {
// ncclAllGather // ncclAllGather
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather(
......
...@@ -94,11 +94,40 @@ static __global__ void MultiTensorApplyCUDAKernel( ...@@ -94,11 +94,40 @@ static __global__ void MultiTensorApplyCUDAKernel(
args...); args...);
} }
template <typename Functor, int BlockDim, int MaxTensorNumPerLaunch, template <int MaxTensorNumPerLaunch, int MaxChunkNumPerLaunch>
int MaxChunkNumPerLaunch, typename... Args> class MultiTensorLauncher {
static void MultiTensorApply(Functor functor, gpuStream_t stream, public:
const int *offsets, int n, int chunk_size, MultiTensorLauncher(
Args... args) { const TensorMetaList<MaxTensorNumPerLaunch, MaxChunkNumPerLaunch> &meta,
const int &chunk_id, const int &chunk_size, const int &block_dim,
const gpuStream_t &stream)
: meta_(meta),
chunk_id_(chunk_id),
chunk_size_(chunk_size),
block_dim_(block_dim),
stream_(stream) {}
template <typename Functor, typename... Args>
void Launch(Functor &&functor, Args &&... args) const {
MultiTensorApplyCUDAKernel<
Functor, MaxTensorNumPerLaunch,
MaxChunkNumPerLaunch><<<chunk_id_, block_dim_, 0, stream_>>>(
functor, meta_, chunk_size_, args...);
}
private:
const TensorMetaList<MaxTensorNumPerLaunch, MaxChunkNumPerLaunch> &meta_;
const int &chunk_id_;
const int &chunk_size_;
const int &block_dim_;
const gpuStream_t &stream_;
};
template <int MaxTensorNumPerLaunch, int MaxChunkNumPerLaunch,
typename Callback>
static void MultiTensorApplyWithCallback(gpuStream_t stream, const int *offsets,
int n, int chunk_size, int block_dim,
Callback &&callback) {
if (n == 0) return; if (n == 0) return;
constexpr auto NumTensor = MaxTensorNumPerLaunch; constexpr auto NumTensor = MaxTensorNumPerLaunch;
...@@ -110,6 +139,11 @@ static void MultiTensorApply(Functor functor, gpuStream_t stream, ...@@ -110,6 +139,11 @@ static void MultiTensorApply(Functor functor, gpuStream_t stream,
int numel_offset = 0; int numel_offset = 0;
metas.start_tensor_id = 0; metas.start_tensor_id = 0;
metas.start_chunk_id = 0; metas.start_chunk_id = 0;
int launch_num = 0;
MultiTensorLauncher<MaxTensorNumPerLaunch, MaxChunkNumPerLaunch> launcher(
metas, chunk_id, chunk_size, block_dim, stream);
for (int i = 0; i < n; ++i) { for (int i = 0; i < n; ++i) {
auto length = offsets[i + 1] - offsets[i]; auto length = offsets[i + 1] - offsets[i];
if (tensor_id == 0) { if (tensor_id == 0) {
...@@ -132,9 +166,8 @@ static void MultiTensorApply(Functor functor, gpuStream_t stream,
      bool last_chunk = (i + 1 == n && j + 1 == chunk_num);
      if (tensor_full || block_full || last_chunk) {
        MultiTensorApplyCUDAKernel<Functor, NumTensor,
                                   NumChunk><<<chunk_id, BlockDim, 0, stream>>>(
            functor, metas, chunk_size, args...);
        callback(launcher, launch_num);
        ++launch_num;
        chunk_id = 0;
        if (j + 1 == chunk_num) {  // chunk for the current tensor is full
          metas.start_chunk_id = 0;
...@@ -152,5 +185,17 @@ static void MultiTensorApply(Functor functor, gpuStream_t stream,
  }
}
template <typename Functor, int MaxTensorNumPerLaunch, int MaxChunkNumPerLaunch,
typename... Args>
static void MultiTensorApply(Functor functor, gpuStream_t stream,
const int *offsets, int n, int chunk_size,
int block_dim, Args &&... args) {
auto callback = [&](const MultiTensorLauncher<MaxTensorNumPerLaunch,
MaxChunkNumPerLaunch> &launcher,
int i) { launcher.Launch(functor, args...); };
MultiTensorApplyWithCallback<MaxTensorNumPerLaunch, MaxChunkNumPerLaunch>(
stream, offsets, n, chunk_size, block_dim, callback);
}
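
// A minimal usage sketch of the callback-based launcher above (hypothetical
// functor and buffers, not part of this diff): each time the chunk metadata
// fills up, the callback receives a ready-to-use launcher plus the launch
// index and decides how to dispatch it.
//
//   SquareFunctor<float> functor;  // hypothetical element-wise functor
//   MultiTensorApplyWithCallback<MaxTensorNumPerLaunch, MaxChunkNumPerLaunch>(
//       stream, offsets, n, chunk_size, block_dim,
//       [&](const MultiTensorLauncher<MaxTensorNumPerLaunch,
//                                     MaxChunkNumPerLaunch> &launcher,
//           int launch_idx) { launcher.Launch(functor, data_ptr); });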
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -39,6 +39,11 @@ __device__ __forceinline__ int sgn(T val) { ...@@ -39,6 +39,11 @@ __device__ __forceinline__ int sgn(T val) {
__device__ __forceinline__ platform::float16 inline_abs(platform::float16 x) { __device__ __forceinline__ platform::float16 inline_abs(platform::float16 x) {
return static_cast<platform::float16>(abs(static_cast<float>(x))); return static_cast<platform::float16>(abs(static_cast<float>(x)));
} }
__device__ __forceinline__ platform::bfloat16 inline_abs(platform::bfloat16 x) {
return static_cast<platform::bfloat16>(abs(static_cast<float>(x)));
}
__device__ __forceinline__ float inline_abs(float x) { return abs(x); } __device__ __forceinline__ float inline_abs(float x) { return abs(x); }
__device__ __forceinline__ double inline_abs(double x) { return abs(x); } __device__ __forceinline__ double inline_abs(double x) { return abs(x); }
...@@ -53,6 +58,11 @@ __device__ __forceinline__ platform::float16 inline_pow( ...@@ -53,6 +58,11 @@ __device__ __forceinline__ platform::float16 inline_pow(
return static_cast<platform::float16>( return static_cast<platform::float16>(
pow(static_cast<float>(base), static_cast<float>(exponent))); pow(static_cast<float>(base), static_cast<float>(exponent)));
} }
__device__ __forceinline__ platform::bfloat16 inline_pow(
platform::bfloat16 base, platform::bfloat16 exponent) {
return static_cast<platform::bfloat16>(
pow(static_cast<float>(base), static_cast<float>(exponent)));
}
__device__ __forceinline__ float inline_pow(float base, float exponent) { __device__ __forceinline__ float inline_pow(float base, float exponent) {
return pow(base, exponent); return pow(base, exponent);
} }
...@@ -202,9 +212,11 @@ using CUDA = paddle::platform::CUDADeviceContext; ...@@ -202,9 +212,11 @@ using CUDA = paddle::platform::CUDADeviceContext;
REGISTER_OP_CUDA_KERNEL(p_norm, REGISTER_OP_CUDA_KERNEL(p_norm,
ops::PnormCUDAKernel<CUDA, paddle::platform::float16>, ops::PnormCUDAKernel<CUDA, paddle::platform::float16>,
ops::PnormCUDAKernel<CUDA, paddle::platform::bfloat16>,
ops::PnormCUDAKernel<CUDA, float>, ops::PnormCUDAKernel<CUDA, float>,
ops::PnormCUDAKernel<CUDA, double>); ops::PnormCUDAKernel<CUDA, double>);
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
p_norm_grad, ops::PnormGradCUDAKernel<CUDA, paddle::platform::float16>, p_norm_grad, ops::PnormGradCUDAKernel<CUDA, paddle::platform::float16>,
ops::PnormGradCUDAKernel<CUDA, paddle::platform::bfloat16>,
ops::PnormGradCUDAKernel<CUDA, float>, ops::PnormGradCUDAKernel<CUDA, float>,
ops::PnormGradCUDAKernel<CUDA, double>); ops::PnormGradCUDAKernel<CUDA, double>);
...@@ -23,6 +23,7 @@ REGISTER_OP_CUDA_KERNEL( ...@@ -23,6 +23,7 @@ REGISTER_OP_CUDA_KERNEL(
reduce_sum_grad, CUDAReduceSumGradKernel<bool>, reduce_sum_grad, CUDAReduceSumGradKernel<bool>,
CUDAReduceSumGradKernel<float>, CUDAReduceSumGradKernel<double>, CUDAReduceSumGradKernel<float>, CUDAReduceSumGradKernel<double>,
CUDAReduceSumGradKernel<paddle::platform::float16>, CUDAReduceSumGradKernel<paddle::platform::float16>,
CUDAReduceSumGradKernel<paddle::platform::bfloat16>,
CUDAReduceSumGradKernel<int>, CUDAReduceSumGradKernel<int64_t>, CUDAReduceSumGradKernel<int>, CUDAReduceSumGradKernel<int64_t>,
CUDAReduceSumGradKernel<paddle::platform::complex<float>>, CUDAReduceSumGradKernel<paddle::platform::complex<float>>,
CUDAReduceSumGradKernel<paddle::platform::complex<double>>); CUDAReduceSumGradKernel<paddle::platform::complex<double>>);
...@@ -15,6 +15,9 @@ limitations under the License. */ ...@@ -15,6 +15,9 @@ limitations under the License. */
#include "paddle/fluid/operators/split_op.h" #include "paddle/fluid/operators/split_op.h"
#include <string> #include <string>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/phi/infermeta/unary.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using framework::Tensor; using framework::Tensor;
...@@ -23,52 +26,6 @@ class SplitOp : public framework::OperatorWithKernel { ...@@ -23,52 +26,6 @@ class SplitOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
platform::errors::InvalidArgument(
"Input(X) of SplitOp should not be null."));
PADDLE_ENFORCE_GE(ctx->Outputs("Out").size(), 1UL,
platform::errors::InvalidArgument(
"Outputs(Out) of SplitOp should not be empty."));
auto in_dims = ctx->GetInputDim("X");
auto outs_names = ctx->Outputs("Out");
size_t axis = static_cast<size_t>(ctx->Attrs().Get<int>("axis"));
size_t num = static_cast<size_t>(ctx->Attrs().Get<int>("num"));
std::vector<int> sections = static_cast<std::vector<int>>(
ctx->Attrs().Get<std::vector<int>>("sections"));
const size_t outs_number = outs_names.size();
if (sections.size() > 0) {
PADDLE_ENFORCE_EQ(
sections.size(), outs_number,
platform::errors::InvalidArgument("tensor split sections size "
"should be equal to output size."));
}
if (ctx->HasInput("AxisTensor")) {
auto out_dims = phi::make_ddim(std::vector<int>(in_dims.size(), -1));
std::vector<framework::DDim> outs_dims(outs_number, out_dims);
ctx->SetOutputsDim("Out", outs_dims);
for (size_t i = 0; i < outs_number; ++i) {
ctx->ShareLoD("X", "Out", 0, i);
}
return;
}
bool each_section_is_known =
(sections.size() > 0 && !ctx->HasInputs("SectionsTensorList"));
auto outs_dims = UpdateOutsDims(ctx->IsRuntime(), each_section_is_known,
in_dims, num, sections, axis, outs_number);
ctx->SetOutputsDim("Out", outs_dims);
if (axis != 0) {
// Only pass LoD when not spliting along the first dim.
for (size_t i = 0; i < outs_number; ++i) {
ctx->ShareLoD("X", "Out", 0, i);
}
}
}
protected: protected:
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override { const framework::ExecutionContext &ctx) const override {
...@@ -168,6 +125,10 @@ Example: ...@@ -168,6 +125,10 @@ Example:
namespace ops = paddle::operators; namespace ops = paddle::operators;
DELCARE_INFER_SHAPE_FUNCTOR(split, SplitInferShapeFunctor,
PT_INFER_META(phi::SplitInferMeta));
REGISTER_OPERATOR(split, ops::SplitOp, ops::SplitOpMaker, REGISTER_OPERATOR(split, ops::SplitOp, ops::SplitOpMaker,
ops::SplitGradMaker<paddle::framework::OpDesc>, ops::SplitGradMaker<paddle::framework::OpDesc>,
                  ops::SplitGradMaker<paddle::imperative::OpBase>,
SplitInferShapeFunctor);
...@@ -258,4 +258,5 @@ REGISTER_OP_CUDA_KERNEL( ...@@ -258,4 +258,5 @@ REGISTER_OP_CUDA_KERNEL(
ops::SumKernel<paddle::platform::CUDADeviceContext, double>, ops::SumKernel<paddle::platform::CUDADeviceContext, double>,
ops::SumKernel<paddle::platform::CUDADeviceContext, int>, ops::SumKernel<paddle::platform::CUDADeviceContext, int>,
ops::SumKernel<paddle::platform::CUDADeviceContext, int64_t>, ops::SumKernel<paddle::platform::CUDADeviceContext, int64_t>,
    ops::SumKernel<paddle::platform::CUDADeviceContext, plat::float16>,
ops::SumKernel<paddle::platform::CUDADeviceContext, plat::bfloat16>);
...@@ -281,10 +281,6 @@ REGISTER_OPERATOR( ...@@ -281,10 +281,6 @@ REGISTER_OPERATOR(
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>, paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
paddle::operators::UniformRandomOpVarTypeInference); paddle::operators::UniformRandomOpVarTypeInference);
REGISTER_OP_CPU_KERNEL(
uniform_random, paddle::operators::CPUUniformRandomKernel<float>,
paddle::operators::CPUUniformRandomKernel<double>,
paddle::operators::CPUUniformRandomKernel<paddle::platform::bfloat16>);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
uniform_random_batch_size_like, uniform_random_batch_size_like,
paddle::operators::CPUUniformRandomKernel<float>, paddle::operators::CPUUniformRandomKernel<float>,
......
...@@ -58,9 +58,6 @@ class GPUUniformRandomKernel : public framework::OpKernel<T> { ...@@ -58,9 +58,6 @@ class GPUUniformRandomKernel : public framework::OpKernel<T> {
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
REGISTER_OP_CUDA_KERNEL(uniform_random,
paddle::operators::GPUUniformRandomKernel<float>,
paddle::operators::GPUUniformRandomKernel<double>);
REGISTER_OP_CUDA_KERNEL(uniform_random_batch_size_like, REGISTER_OP_CUDA_KERNEL(uniform_random_batch_size_like,
paddle::operators::GPUUniformRandomKernel<float>, paddle::operators::GPUUniformRandomKernel<float>,
paddle::operators::GPUUniformRandomKernel<double>); paddle::operators::GPUUniformRandomKernel<double>);
...@@ -12,8 +12,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/operators/where_op.h"
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/multiary.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -21,31 +23,6 @@ class WhereOp : public framework::OperatorWithKernel { ...@@ -21,31 +23,6 @@ class WhereOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("Condition"), "Input", "Condition", "Where");
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Where");
OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "Where");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Where");
auto cond_dims = ctx->GetInputDim("Condition");
auto x_dims = ctx->GetInputDim("X");
auto y_dims = ctx->GetInputDim("Y");
PADDLE_ENFORCE_EQ(
cond_dims, x_dims,
platform::errors::InvalidArgument(
"The dims of Inputs(Condition) and Inputs(X) should be same. "
"But received Condition's shape is [%s], X's shape is [%s]",
cond_dims, x_dims));
PADDLE_ENFORCE_EQ(x_dims, y_dims,
platform::errors::InvalidArgument(
"The dims of Inputs(X) and Inputs(Y) should be same. "
"But received X's shape is [%s], Y's shape is [%s]",
x_dims, y_dims));
ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
ctx->ShareLoD("X", /*->*/ "Out");
}
protected: protected:
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
...@@ -140,19 +117,12 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(WhereGradNoNeedBufferVarsInferer, "X", "Y"); ...@@ -140,19 +117,12 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(WhereGradNoNeedBufferVarsInferer, "X", "Y");
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
DELCARE_INFER_SHAPE_FUNCTOR(where, WhereInferShapeFunctor,
PT_INFER_META(phi::WhereInferMeta));
REGISTER_OPERATOR(where, ops::WhereOp, ops::WhereOpMaker,
                  ops::WhereOpGradMaker<paddle::framework::OpDesc>,
                  ops::WhereOpGradMaker<paddle::imperative::OpBase>,
WhereInferShapeFunctor);
REGISTER_OPERATOR(where_grad, ops::WhereGradOp, REGISTER_OPERATOR(where_grad, ops::WhereGradOp,
ops::WhereGradNoNeedBufferVarsInferer); ops::WhereGradNoNeedBufferVarsInferer);
REGISTER_OP_CPU_KERNEL(
where, ops::WhereKernel<paddle::platform::CPUDeviceContext, float>,
ops::WhereKernel<paddle::platform::CPUDeviceContext, double>,
ops::WhereKernel<paddle::platform::CPUDeviceContext, int>,
ops::WhereKernel<paddle::platform::CPUDeviceContext, int64_t>);
REGISTER_OP_CPU_KERNEL(
where_grad, ops::WhereGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::WhereGradKernel<paddle::platform::CPUDeviceContext, double>,
ops::WhereGradKernel<paddle::platform::CPUDeviceContext, int>,
ops::WhereGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h"
#include "paddle/fluid/operators/where_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
namespace platform = paddle::platform;
namespace paddle {
namespace operators {
template <typename T>
struct CondFunctor {
HOSTDEVICE inline CondFunctor() {}
HOSTDEVICE inline T operator()(const bool cond, const T x, const T y) const {
return cond ? x : y;
}
};
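
// CondFunctor is consumed below by WhereKernel<CUDADeviceContext, T>::Compute
// through LaunchSameDimsElementwiseCudaKernel, which applies the ternary
// select element-wise over {Condition, X, Y}. The raw WhereCUDAKernel that
// follows implements the same select with a plain grid-stride loop.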
template <typename T>
__global__ void WhereCUDAKernel(const int N, const bool* cond, const T* x,
const T* y, T* out) {
int idx = blockDim.x * blockIdx.x + threadIdx.x;
for (; idx < N; idx += blockDim.x * gridDim.x) {
out[idx] = cond[idx] ? x[idx] : y[idx];
}
}
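
// Grid-stride gradient kernel: for each element the upstream gradient flows
// to dx where cond is true and to dy where it is false; a null dx or dy
// pointer means the corresponding input does not need a gradient.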
template <typename T>
__global__ void WhereGradCUDAKernel(const int N, const T* dout,
const bool* cond, T* dx, T* dy) {
int idx = blockDim.x * blockIdx.x + threadIdx.x;
for (; idx < N; idx += blockDim.x * gridDim.x) {
if (dx != nullptr) {
dx[idx] = cond[idx] ? dout[idx] : 0.;
}
if (dy != nullptr) {
dy[idx] = cond[idx] ? 0. : dout[idx];
}
}
}
template <typename T>
class WhereKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* condition = context.Input<framework::Tensor>("Condition");
auto* X = context.Input<framework::Tensor>("X");
auto* Y = context.Input<framework::Tensor>("Y");
auto* out = context.Output<framework::Tensor>("Out");
auto numel = condition->numel();
// TODO(GaaoWei8): Input of where can be broadcast
const bool* cond_data = condition->data<bool>();
const T* x_data = X->data<T>();
const T* y_data = Y->data<T>();
T* out_data = out->mutable_data<T>(context.GetPlace());
auto stream = context.cuda_device_context().stream();
auto& dev_ctx =
context.template device_context<platform::CUDADeviceContext>();
auto functor = CondFunctor<T>();
std::vector<const framework::Tensor*> ins = {condition, X, Y};
std::vector<framework::Tensor*> outs = {out};
paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
&outs, functor);
}
};
template <typename T>
class WhereGradKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* condition = context.Input<framework::Tensor>("Condition");
const bool* cond_data = condition->data<bool>();
auto numel = condition->numel();
auto* dout_t =
context.Input<framework::Tensor>(framework::GradVarName("Out"));
auto* dx_t = context.Output<framework::Tensor>(framework::GradVarName("X"));
auto* dy_t = context.Output<framework::Tensor>(framework::GradVarName("Y"));
auto* dout = dout_t->data<T>();
T* dx =
(dx_t != nullptr) ? dx_t->mutable_data<T>(context.GetPlace()) : nullptr;
T* dy =
(dy_t != nullptr) ? dy_t->mutable_data<T>(context.GetPlace()) : nullptr;
auto stream = context.cuda_device_context().stream();
auto& dev_ctx =
context.template device_context<platform::CUDADeviceContext>();
auto config = GetGpuLaunchConfig1D(dev_ctx, condition->numel());
WhereGradCUDAKernel<
T><<<config.block_per_grid.x, config.thread_per_block.x, 0, stream>>>(
numel, dout, cond_data, dx, dy);
}
};
} // namespace operators
} // namespace paddle
REGISTER_OP_CUDA_KERNEL(
where, paddle::operators::WhereKernel<platform::CUDADeviceContext, float>,
paddle::operators::WhereKernel<platform::CUDADeviceContext, double>,
paddle::operators::WhereKernel<platform::CUDADeviceContext, int>,
paddle::operators::WhereKernel<platform::CUDADeviceContext, int64_t>);
REGISTER_OP_CUDA_KERNEL(
where_grad,
paddle::operators::WhereGradKernel<platform::CUDADeviceContext, float>,
paddle::operators::WhereGradKernel<platform::CUDADeviceContext, double>,
paddle::operators::WhereGradKernel<platform::CUDADeviceContext, int>,
paddle::operators::WhereGradKernel<platform::CUDADeviceContext, int64_t>);
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class WhereKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* condition = context.Input<framework::Tensor>("Condition");
auto* X = context.Input<framework::Tensor>("X");
auto* Y = context.Input<framework::Tensor>("Y");
auto* out = context.Output<framework::Tensor>("Out");
const bool* cond_data = condition->data<bool>();
const T* x_data = X->data<T>();
const T* y_data = Y->data<T>();
T* out_data = out->mutable_data<T>(context.GetPlace());
auto x_numel = X->numel();
for (int i = 0; i < x_numel; i++) {
out_data[i] = cond_data[i] ? x_data[i] : y_data[i];
}
}
};
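
// Shape note: the checks previously in WhereOp::InferShape (now delegated to
// phi::WhereInferMeta) require Condition, X and Y to share the same dims, so
// a single loop over X->numel() covers every element, e.g. for inputs of
// shape [2, 3] the kernel evaluates out[i] = cond[i] ? x[i] : y[i] for
// i in [0, 6). Broadcasting is not handled here.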
template <typename DeviceContext, typename T>
class WhereGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* condition = context.Input<framework::LoDTensor>("Condition");
const auto* cond_data = condition->data<bool>();
auto numel = condition->numel();
auto* dout_t =
context.Input<framework::Tensor>(framework::GradVarName("Out"));
auto* dx_t = context.Output<framework::Tensor>(framework::GradVarName("X"));
auto* dy_t = context.Output<framework::Tensor>(framework::GradVarName("Y"));
auto* dout = dout_t->data<T>();
if (dx_t != nullptr) {
auto* dx = dx_t->mutable_data<T>(context.GetPlace());
for (int i = 0; i < numel; i++) {
dx[i] = dout[i] * (cond_data[i] ? 1. : 0.);
}
}
if (dy_t != nullptr) {
auto* dy = dy_t->mutable_data<T>(context.GetPlace());
for (int i = 0; i < numel; i++) {
dy[i] = dout[i] * (cond_data[i] ? 0. : 1.);
}
}
}
};
} // namespace operators
} // namespace paddle
...@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/operators/where_op.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"

namespace paddle {
......
...@@ -14,7 +14,7 @@
#ifdef PADDLE_WITH_XPU

#include "paddle/fluid/operators/where_op.h"
#include "paddle/fluid/framework/op_registry.h"

namespace paddle {
namespace operators {
......
...@@ -20,6 +20,7 @@ limitations under the License. */ ...@@ -20,6 +20,7 @@ limitations under the License. */
#include <hip/hip_runtime.h> #include <hip/hip_runtime.h>
#endif #endif
#include <stdio.h> #include <stdio.h>
#include "paddle/fluid/platform/bfloat16.h"
#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/complex.h"
#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/float16.h"
...@@ -244,6 +245,72 @@ __device__ __forceinline__ void VectorizedAtomicAddPerBlock( ...@@ -244,6 +245,72 @@ __device__ __forceinline__ void VectorizedAtomicAddPerBlock(
#endif #endif
#endif #endif
// NOTE(zhangbo): cuda do not have atomicCAS for __nv_bfloat16.
inline static __device__ uint32_t bf16_add_to_low_half(uint32_t val, float x) {
bfloat16 low_half;
// the bfloat16 in lower 16bits
low_half.x = static_cast<uint16_t>(val & 0xFFFFu);
low_half = static_cast<bfloat16>(static_cast<float>(low_half) + x);
return (val & 0xFFFF0000u) | low_half.x;
}
inline static __device__ uint32_t bf16_add_to_high_half(uint32_t val, float x) {
bfloat16 high_half;
// the bfloat16 in higher 16bits
high_half.x = static_cast<uint16_t>(val >> 16);
high_half = static_cast<bfloat16>(static_cast<float>(high_half) + x);
return (val & 0xFFFFu) | (static_cast<uint32_t>(high_half.x) << 16);
}
#if CUDA_VERSION >= 11000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
static __device__ __forceinline__ bfloat16 CUDABF16ToPDBF16(__nv_bfloat16 x) {
return *reinterpret_cast<bfloat16 *>(&x);
}
static __device__ __forceinline__ __nv_bfloat16 PDBF16ToCUDABF16(bfloat16 x) {
return *reinterpret_cast<__nv_bfloat16 *>(&x);
}
CUDA_ATOMIC_WRAPPER(Add, bfloat16) {
return CUDABF16ToPDBF16(atomicAdd(reinterpret_cast<__nv_bfloat16 *>(address),
PDBF16ToCUDABF16(val)));
}
#else
CUDA_ATOMIC_WRAPPER(Add, bfloat16) {
// concrete packed bfloat16 value may exsits in lower or higher 16bits
// of the 32bits address.
uint32_t *address_as_ui = reinterpret_cast<uint32_t *>(
reinterpret_cast<char *>(address) -
(reinterpret_cast<uintptr_t>(address) & 0x02));
float val_f = static_cast<float>(val);
uint32_t old = *address_as_ui;
uint32_t sum;
uint32_t newval;
uint32_t assumed;
if (((uintptr_t)address & 0x02) == 0) {
// the bfloat16 value stay at lower 16 bits of the address.
do {
assumed = old;
old = atomicCAS(address_as_ui, assumed,
bf16_add_to_low_half(assumed, val_f));
} while (old != assumed);
bfloat16 ret;
ret.x = old & 0xFFFFu;
return ret;
} else {
// the bfloat16 value stay at higher 16 bits of the address.
do {
assumed = old;
old = atomicCAS(address_as_ui, assumed,
bf16_add_to_high_half(assumed, val_f));
} while (old != assumed);
bfloat16 ret;
ret.x = old >> 16;
return ret;
}
}
#endif
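
// Layout sketch for the CAS fallback above (illustration only): a 4-byte
// aligned word holds two adjacent bfloat16 values, and (address & 0x02)
// selects which half the target value occupies.
//
//   aligned uint32_t word:  [ high bfloat16 | low bfloat16 ]
//                             bits 31..16     bits 15..0
//
// bf16_add_to_low_half / bf16_add_to_high_half rebuild only that half while
// preserving the other 16 bits, so the atomicCAS loop never corrupts the
// neighboring value.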
CUDA_ATOMIC_WRAPPER(Add, complex<float>) { CUDA_ATOMIC_WRAPPER(Add, complex<float>) {
float *real = reinterpret_cast<float *>(address); float *real = reinterpret_cast<float *>(address);
float *imag = real + 1; float *imag = real + 1;
......
...@@ -81,7 +81,7 @@ set(PYBIND_SRCS
  cuda_streams_py.cc)

if(NOT ON_INFER)
    set (PYBIND_DEPS ${PYBIND_DEPS} processgroup)
    set (PYBIND_DEPS ${PYBIND_DEPS} processgroup eager_reducer)
    if (WITH_NCCL)
      set (PYBIND_DEPS ${PYBIND_DEPS} processgroup_nccl)
    endif()
......
...@@ -23,6 +23,7 @@ limitations under the License. */ ...@@ -23,6 +23,7 @@ limitations under the License. */
#include "paddle/fluid/distributed/collective/ProcessGroup.h" #include "paddle/fluid/distributed/collective/ProcessGroup.h"
#include "paddle/fluid/distributed/collective/Types.h" #include "paddle/fluid/distributed/collective/Types.h"
#include "paddle/fluid/distributed/collective/reducer.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/imperative/layer.h" #include "paddle/fluid/imperative/layer.h"
...@@ -143,6 +144,19 @@ void BindDistributed(py::module *m) { ...@@ -143,6 +144,19 @@ void BindDistributed(py::module *m) {
[](distributed::ProcessGroupStrategy &self, int nrings) { [](distributed::ProcessGroupStrategy &self, int nrings) {
self.nrings_ = nrings; self.nrings_ = nrings;
}); });
m->def("eager_assign_group_by_size",
[](py::handle py_tensors, std::vector<bool> is_sparse_gradient,
std::vector<size_t> group_size_limits,
std::vector<int64_t> tensor_indices) {
auto tensors = CastPyArg2VectorOfTensor(py_tensors.ptr(), 0);
return distributed::Eager_AssignGroupBySize(
tensors, is_sparse_gradient, group_size_limits, tensor_indices);
},
py::arg("tensors"), py::arg("is_sparse_gradient"),
py::arg("group_size_limits") = std::vector<size_t>{25 * 1024 * 1024},
py::arg("tensor_indices") = std::vector<int64_t>{},
py::call_guard<py::gil_scoped_release>());
} }
} // end namespace pybind } // end namespace pybind
......
...@@ -60,7 +60,8 @@ std::map<std::string, std::set<std::string>> op_ins_map = {
    {"momentum", {"Param", "Grad", "Velocity", "LearningRate", "MasterParam"}},
    {"merged_momentum",
     {"Param", "Grad", "Velocity", "LearningRate", "MasterParam"}},
    {"sparse_momentum", {"Param", "Grad", "Velocity", "Index", "LearningRate"}},
    {"sparse_momentum",
{"Param", "Grad", "Velocity", "Index", "LearningRate", "MasterParam"}},
{"rnn", {"Input", "PreState", "WeightList", "SequenceLength"}}, {"rnn", {"Input", "PreState", "WeightList", "SequenceLength"}},
{"run_program", {"X", "Params"}}, {"run_program", {"X", "Params"}},
{"fused_feedforward", {"fused_feedforward",
...@@ -124,7 +125,7 @@ std::map<std::string, std::set<std::string>> op_outs_map = { ...@@ -124,7 +125,7 @@ std::map<std::string, std::set<std::string>> op_outs_map = {
{"generate_proposals_v2", {"RpnRois", "RpnRoiProbs", "RpnRoisNum"}}, {"generate_proposals_v2", {"RpnRois", "RpnRoiProbs", "RpnRoisNum"}},
{"momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}}, {"momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}},
{"merged_momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}}, {"merged_momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}},
{"sparse_momentum", {"ParamOut", "VelocityOut"}}, {"sparse_momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}},
{"rnn", {"DropoutState", "Reserve", "Out", "State"}}, {"rnn", {"DropoutState", "Reserve", "Out", "State"}},
{"run_program", {"DOut"}}, {"run_program", {"DOut"}},
{"adam", {"adam",
...@@ -181,7 +182,7 @@ std::map<std::string, std::set<std::string>> op_passing_outs_map = { ...@@ -181,7 +182,7 @@ std::map<std::string, std::set<std::string>> op_passing_outs_map = {
"out_old_num_accumulates", "out_num_updates"}}, "out_old_num_accumulates", "out_num_updates"}},
{"momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}}, {"momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}},
{"merged_momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}}, {"merged_momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}},
{"sparse_momentum", {"ParamOut", "VelocityOut"}}, {"sparse_momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}},
{"batch_norm", {"MeanOut", "VarianceOut"}}, {"batch_norm", {"MeanOut", "VarianceOut"}},
{"sync_batch_norm", {"MeanOut", "VarianceOut"}}, {"sync_batch_norm", {"MeanOut", "VarianceOut"}},
{"accuracy", {"Correct", "Total"}}, {"accuracy", {"Correct", "Total"}},
......
...@@ -40,6 +40,12 @@ def DenseTensor : Infrt_Type<"DenseTensor"> { ...@@ -40,6 +40,12 @@ def DenseTensor : Infrt_Type<"DenseTensor"> {
); );
} }
// Type Constrait for concrete DenseTensor type.
class DenseTensor<string target, string precision, string layout> :
Type<CPred<"$_self == ::infrt::DenseTensorType::get($_self.getContext(), ::infrt::TargetType::"#target#",::infrt::PrecisionType::"#precision#",::infrt::LayoutType::"#layout#")">,
"!infrt.DenseTensor<"#target#","#precision#","#layout#">",
"::infrt::DenseTensorType">;
// Base class for infrt dialect attributes. // Base class for infrt dialect attributes.
class Infrt_Attr<string name, list<Trait> traits = [], class Infrt_Attr<string name, list<Trait> traits = [],
string baseCppClass = "::mlir::Attribute"> string baseCppClass = "::mlir::Attribute">
......
...@@ -21,8 +21,8 @@
#include "paddle/infrt/dialect/infrt/infrt_dialect.h"
#include "paddle/infrt/dialect/infrt_base.h"
#include "paddle/infrt/dialect/pd_ops.h"
#include "paddle/infrt/dialect/phi/infrt_phi_tensor.h"
#include "paddle/infrt/dialect/phi/phi_base.h"
#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h"
#include "paddle/infrt/dialect/phi/ir/phi_base.h"
#include "paddle/infrt/dialect/tensor_shape.h"

namespace infrt {
......
...@@ -2,16 +2,7 @@ if (NOT INFRT_WITH_PHI)
  return()
endif()

#mlir_tablegen_on(infrt_phi_base DIALECT phi)
add_subdirectory(ir)
add_mlir_dialect(infrt_phi_base phi)
add_mlir_dialect(infrt_phi_tensor phi_dt)
add_mlir_dialect(infrt_phi_kernel phi_kernel)
#mlir_tablegen_on(infrt_phi_tensor)
gather_srcs(infrt_src SRCS
phi_base.cc infrt_phi_tensor.cc
infrt_phi_tensor.cc)
add_subdirectory(pass) add_subdirectory(pass)
add_executable(phi-exec phi_exec.cc) add_executable(phi-exec phi_exec.cc)
......
#mlir_tablegen_on(infrt_phi_base DIALECT phi)
add_mlir_dialect(infrt_phi_base phi)
add_mlir_dialect(infrt_phi_tensor phi_dt)
add_mlir_dialect(infrt_phi_kernel phi_kernel)
#mlir_tablegen_on(infrt_phi_tensor)
gather_srcs(infrt_src SRCS
phi_base.cc
infrt_phi_tensor.cc)
...@@ -4,7 +4,7 @@
include "mlir/Interfaces/SideEffectInterfaces.td"
include "mlir/IR/OpBase.td"
include "paddle/infrt/dialect/infrt_base.td"
include "paddle/infrt/dialect/phi/infrt_phi_base.td"
include "paddle/infrt/dialect/phi/ir/infrt_phi_base.td"

def PHI_KernelDialect : Dialect {
  let name = "phi_kernel";
......
...@@ -12,12 +12,12 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/infrt/dialect/phi/infrt_phi_tensor.h"
#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h"

#include <mlir/IR/BuiltinTypes.h>

#include "paddle/infrt/dialect/phi/infrt_phi_tensorDialect.cpp.inc"
#include "paddle/infrt/dialect/phi/infrt_phi_tensorTypes.cpp.inc"
#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensorDialect.cpp.inc"
#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensorTypes.cpp.inc"

namespace infrt {
namespace phi {
...@@ -25,7 +25,7 @@ namespace phi {
void PHIDenseTensorDialect::initialize() {
#define GET_OP_LIST
  addOperations<
#include "paddle/infrt/dialect/phi/infrt_phi_tensor.cpp.inc"
#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.cpp.inc"
      >();
}
...@@ -33,4 +33,4 @@ void PHIDenseTensorDialect::initialize() {
}  // namespace infrt

#define GET_OP_CLASSES
#include "paddle/infrt/dialect/phi/infrt_phi_tensor.cpp.inc"  // NOLINT
#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.cpp.inc"  // NOLINT
...@@ -29,11 +29,11 @@
#include <mlir/Interfaces/LoopLikeInterface.h>
#include <mlir/Interfaces/SideEffectInterfaces.h>

#include "paddle/infrt/dialect/phi/infrt_phi_tensorDialect.h.inc"
#include "paddle/infrt/dialect/phi/infrt_phi_tensorTypes.h.inc"
#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensorDialect.h.inc"
#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensorTypes.h.inc"

#include "paddle/infrt/dialect/dense_tensor.h"
#include "paddle/infrt/dialect/phi/phi_base.h"
#include "paddle/infrt/dialect/phi/ir/phi_base.h"
// NOLINT
#define GET_OP_CLASSES
#include "paddle/infrt/dialect/phi/infrt_phi_tensor.h.inc"
#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h.inc"
...@@ -2,7 +2,7 @@
#else
#define PHI_TENSOR

include "paddle/infrt/dialect/phi/infrt_phi_base.td"
include "paddle/infrt/dialect/phi/ir/infrt_phi_base.td"
include "mlir/Interfaces/SideEffectInterfaces.td"
include "mlir/IR/OpBase.td"
include "paddle/infrt/dialect/infrt_base.td"
......
...@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/infrt/dialect/phi/phi_base.h"
#include "paddle/infrt/dialect/phi/ir/phi_base.h"

#include <mlir/IR/Builders.h>
#include <mlir/IR/Dialect.h>
...@@ -21,8 +21,8 @@
#include <mlir/IR/TypeUtilities.h>
#include <mlir/IR/Types.h>
#include "paddle/infrt/common/global.h"
#include "paddle/infrt/dialect/phi/infrt_phi_base.cpp.inc"
#include "paddle/infrt/dialect/phi/infrt_phi_baseDialect.cpp.inc"
#include "paddle/infrt/dialect/phi/ir/infrt_phi_base.cpp.inc"
#include "paddle/infrt/dialect/phi/ir/infrt_phi_baseDialect.cpp.inc"

namespace infrt {
namespace phi {
...@@ -51,11 +51,11 @@ void PHIDialect::printType(::mlir::Type type,
void PHIDialect::initialize() {
  addOperations<
#define GET_OP_LIST
#include "paddle/infrt/dialect/phi/infrt_phi_base.cpp.inc"  // NOLINT
#include "paddle/infrt/dialect/phi/ir/infrt_phi_base.cpp.inc"  // NOLINT
      >();
  addTypes<
#define GET_TYPEDEF_LIST
#include "paddle/infrt/dialect/phi/infrt_phi_baseTypes.cpp.inc"  // NOLINT
#include "paddle/infrt/dialect/phi/ir/infrt_phi_baseTypes.cpp.inc"  // NOLINT
      >();
}
...@@ -81,4 +81,4 @@ mlir::Type PHIDialect::parseType(mlir::DialectAsmParser& parser) const {
}  // namespace infrt

#define GET_TYPEDEF_CLASSES
#include "paddle/infrt/dialect/phi/infrt_phi_baseTypes.cpp.inc"  // NOLINT
#include "paddle/infrt/dialect/phi/ir/infrt_phi_baseTypes.cpp.inc"  // NOLINT
...@@ -19,11 +19,13 @@
#include <string>

#include "paddle/infrt/dialect/phi/infrt_phi_base.h.inc"
#include "paddle/infrt/dialect/phi/infrt_phi_baseDialect.h.inc"
#include "paddle/infrt/dialect/phi/ir/infrt_phi_baseDialect.h.inc"

#define GET_TYPEDEF_CLASSES
#include "paddle/infrt/dialect/phi/infrt_phi_baseTypes.h.inc"
#include "paddle/infrt/dialect/phi/ir/infrt_phi_baseTypes.h.inc"
#define GET_OP_CLASSES
#include "paddle/infrt/dialect/phi/ir/infrt_phi_base.h.inc"
namespace mlir { namespace mlir {
namespace OpTrait { namespace OpTrait {
......
...@@ -73,7 +73,7 @@ using ValueVariantType =
                     std::vector<phi::DenseTensor>,
                     paddle::experimental::ScalarBase<phi::DenseTensor>,
                     paddle::experimental::ScalarArrayBase<phi::DenseTensor>,
                     std::vector<phi::MetaTensor>,
                     std::vector<phi::MetaTensor*>,
                     phi::MetaConfig,
                     paddle::experimental::Backend,
                     paddle::experimental::DataLayout,
......
...@@ -94,12 +94,16 @@ std::vector<Tensor> split_impl(const Tensor& x, ...@@ -94,12 +94,16 @@ std::vector<Tensor> split_impl(const Tensor& x,
std::vector<Tensor> out; std::vector<Tensor> out;
auto dense_outs = SetKernelOutput(out_number, kernel_backend, &out); auto dense_outs = SetKernelOutput(out_number, kernel_backend, &out);
std::vector<phi::MetaTensor> meta_outs; std::vector<phi::MetaTensor> meta_outs;
meta_outs.reserve(out_number);
std::vector<phi::MetaTensor*> meta_out_ptrs;
meta_out_ptrs.reserve(out_number);
for (size_t i = 0; i < out_number; ++i) { for (size_t i = 0; i < out_number; ++i) {
meta_outs.push_back(dense_outs[i]); meta_outs.push_back(dense_outs[i]);
meta_out_ptrs.push_back(&meta_outs.back());
} }
  phi::SplitInferMeta(
      MakeMetaTensor(*dense_x), num_or_sections, axis, &meta_outs);
      MakeMetaTensor(*dense_x), num_or_sections, axis, meta_out_ptrs);

  using kernel_signature = void (*)(const platform::DeviceContext&,
                                    const phi::DenseTensor&,
......
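
// The extra meta_out_ptrs vector above exists because SplitInferMeta (see the
// infermeta_utils changes further down) now takes std::vector<MetaTensor*> for
// its outputs, so the API layer keeps the MetaTensor storage in meta_outs and
// passes stable pointers into it.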
...@@ -136,26 +136,5 @@ phi::ScalarArray MakePhiScalarArrayFromVarList( ...@@ -136,26 +136,5 @@ phi::ScalarArray MakePhiScalarArrayFromVarList(
return result; return result;
} }
void ResetTensorDtypeAndLayoutByArgDef(phi::TensorBase* dst,
const phi::TensorArgDef& arg_def) {
VLOG(5) << "ResetTensor by TensorArgDef.";
if (phi::DenseTensor::classof(dst)) {
auto* dense_t = static_cast<phi::DenseTensor*>(dst);
auto* meta = phi::DenseTensorUtils::GetMutableMeta(dense_t);
meta->dtype = arg_def.dtype;
meta->layout = arg_def.layout;
} else if (phi::SelectedRows::classof(dst)) {
auto* selected_rows = static_cast<phi::SelectedRows*>(dst);
auto* meta =
phi::DenseTensorUtils::GetMutableMeta(selected_rows->mutable_value());
meta->dtype = arg_def.dtype;
meta->layout = arg_def.layout;
} else {
PADDLE_THROW(phi::errors::Unimplemented(
"Unsupported tensor type is received when reseting tensor dtype and "
"layout by argument definition."));
}
}
} // namespace experimental } // namespace experimental
} // namespace paddle } // namespace paddle
...@@ -42,8 +42,5 @@ phi::ScalarArray MakePhiScalarArrayFromVar(const framework::Variable& variable); ...@@ -42,8 +42,5 @@ phi::ScalarArray MakePhiScalarArrayFromVar(const framework::Variable& variable);
phi::ScalarArray MakePhiScalarArrayFromVarList( phi::ScalarArray MakePhiScalarArrayFromVarList(
const std::vector<framework::Variable*>& variable_list); const std::vector<framework::Variable*>& variable_list);
void ResetTensorDtypeAndLayoutByArgDef(phi::TensorBase* dst,
const phi::TensorArgDef& arg_def);
} // namespace experimental } // namespace experimental
} // namespace paddle } // namespace paddle
...@@ -227,4 +227,12 @@ class GPUContext : public DeviceContext { ...@@ -227,4 +227,12 @@ class GPUContext : public DeviceContext {
// must use different function name for cudnn kernel // must use different function name for cudnn kernel
using GPUDNNContext = GPUContext; using GPUDNNContext = GPUContext;
// KPS (Kernel PrimitiveS API) needs to exist as a kind of backend,
// because we want to implement a KPS-based kernel and make it run
// on GPU and XPU at the same time, so we need KPSContext when registering
// KPS Kernel. Note: XPU and GPU cannot be compiled at the same time!
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
using KPSContext = GPUContext;
#endif
} // namespace phi } // namespace phi
...@@ -66,4 +66,12 @@ class XPUContext : public DeviceContext { ...@@ -66,4 +66,12 @@ class XPUContext : public DeviceContext {
std::unique_ptr<Impl> impl_; std::unique_ptr<Impl> impl_;
}; };
// KPS (Kernel PrimitiveS API) needs to exist as a kind of backend,
// because we want to implement a KPS-based kernel and make it run
// on GPU and XPU at the same time, so we need KPSContext when registering
// KPS Kernel. Note: XPU and GPU cannot be compiled at the same time!
#if PADDLE_WITH_XPU_KP
using KPSContext = XPUContext;
#endif
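
// A minimal registration sketch under the assumptions above (hypothetical
// kernel name, assuming the PD_REGISTER_KERNEL macro used elsewhere in phi;
// not part of this diff): a kernel implemented with the Kernel Primitives API
// lives in a kps/ source file and is registered once against the KPS backend,
// while KPSContext resolves to GPUContext or XPUContext depending on which
// device the library was built for.
//
//   template <typename T, typename Context>
//   void MyEltwiseKernel(const Context& dev_ctx, const DenseTensor& x,
//                        DenseTensor* out);
//
//   PD_REGISTER_KERNEL(my_eltwise, KPS, ALL_LAYOUT,
//                      phi::MyEltwiseKernel, float) {}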
} // namespace phi } // namespace phi
...@@ -52,6 +52,9 @@ enum class Backend : uint8_t { ...@@ -52,6 +52,9 @@ enum class Backend : uint8_t {
MKLDNN, MKLDNN,
GPUDNN, // cuDNN and hipDNN GPUDNN, // cuDNN and hipDNN
// paddle kernel primitives backend
KPS,
// end of backend types // end of backend types
NUM_BACKENDS, NUM_BACKENDS,
...@@ -115,6 +118,9 @@ inline std::ostream& operator<<(std::ostream& os, Backend backend) { ...@@ -115,6 +118,9 @@ inline std::ostream& operator<<(std::ostream& os, Backend backend) {
case Backend::GPUDNN: case Backend::GPUDNN:
os << "GPUDNN"; os << "GPUDNN";
break; break;
case Backend::KPS:
os << "KPS";
break;
default: { default: {
size_t device_type_id_ = static_cast<size_t>(backend) - size_t device_type_id_ = static_cast<size_t>(backend) -
static_cast<size_t>(Backend::NUM_BACKENDS); static_cast<size_t>(Backend::NUM_BACKENDS);
...@@ -147,6 +153,8 @@ inline Backend StringToBackend(const char* backend_cstr) { ...@@ -147,6 +153,8 @@ inline Backend StringToBackend(const char* backend_cstr) {
return Backend::MKLDNN; return Backend::MKLDNN;
} else if (s == std::string("GPUDNN")) { } else if (s == std::string("GPUDNN")) {
return Backend::GPUDNN; return Backend::GPUDNN;
} else if (s == std::string("KPS")) {
return Backend::KPS;
} else { } else {
return static_cast<Backend>(static_cast<size_t>(Backend::NUM_BACKENDS) + return static_cast<Backend>(static_cast<size_t>(Backend::NUM_BACKENDS) +
phi::GetOrRegisterGlobalDeviceTypeId(s)); phi::GetOrRegisterGlobalDeviceTypeId(s));
......
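
// Round-trip sketch for the new enum value (illustration only, relying on the
// operator<< and StringToBackend helpers shown above):
//
//   std::ostringstream os;
//   os << phi::Backend::KPS;                           // prints "KPS"
//   assert(phi::StringToBackend("KPS") == phi::Backend::KPS);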
...@@ -22,8 +22,8 @@ cc_library(sparse_csr_tensor SRCS sparse_csr_tensor.cc DEPS dense_tensor tensor_ ...@@ -22,8 +22,8 @@ cc_library(sparse_csr_tensor SRCS sparse_csr_tensor.cc DEPS dense_tensor tensor_
cc_library(meta_tensor SRCS meta_tensor.cc DEPS tensor_base tensor_meta dense_tensor) cc_library(meta_tensor SRCS meta_tensor.cc DEPS tensor_base tensor_meta dense_tensor)
cc_library(infermeta_utils SRCS infermeta_utils.cc DEPS meta_tensor) cc_library(infermeta_utils SRCS infermeta_utils.cc DEPS meta_tensor)
cc_library(phi_device_context SRCS device_context.cc DEPS dense_tensor selected_rows)
cc_library(selected_rows SRCS selected_rows_impl.cc DEPS dense_tensor phi_enforce ddim memcpy) cc_library(selected_rows SRCS selected_rows_impl.cc DEPS dense_tensor phi_enforce ddim memcpy)
cc_library(phi_device_context SRCS device_context.cc DEPS dense_tensor selected_rows)
cc_library(phi_custom_kernel SRCS custom_kernel.cc DEPS kernel_factory convert_utils) cc_library(phi_custom_kernel SRCS custom_kernel.cc DEPS kernel_factory convert_utils)
......
...@@ -66,6 +66,14 @@ phi::Place TransToPhiPlace(const Backend& backend, bool set_device_id) { ...@@ -66,6 +66,14 @@ phi::Place TransToPhiPlace(const Backend& backend, bool set_device_id) {
case phi::Backend::XPU: case phi::Backend::XPU:
return phi::XPUPlace( return phi::XPUPlace(
set_device_id ? phi::backends::xpu::GetXPUCurrentDeviceId() : 0); set_device_id ? phi::backends::xpu::GetXPUCurrentDeviceId() : 0);
#endif
case phi::Backend::KPS:
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
return phi::GPUPlace(
set_device_id ? phi::backends::gpu::GetCurrentDeviceId() : 0);
#elif defined(PADDLE_WITH_XPU_KP)
return phi::XPUPlace(
set_device_id ? phi::backends::xpu::GetXPUCurrentDeviceId() : 0);
#endif #endif
default: { default: {
#ifdef PADDLE_WITH_CUSTOM_DEVICE #ifdef PADDLE_WITH_CUSTOM_DEVICE
......
...@@ -20,16 +20,16 @@ void RegisterCustomKernels(const CustomKernelMap& custom_kernel_map) { ...@@ -20,16 +20,16 @@ void RegisterCustomKernels(const CustomKernelMap& custom_kernel_map) {
auto& kernel_info_map = custom_kernel_map.GetMap(); auto& kernel_info_map = custom_kernel_map.GetMap();
VLOG(3) << "Size of custom_kernel_map: " << kernel_info_map.size(); VLOG(3) << "Size of custom_kernel_map: " << kernel_info_map.size();
auto& kernels = KernelFactory::Instance().kernels();
  for (auto& pair : kernel_info_map) {
    PADDLE_ENFORCE_EQ(
        KernelFactory::Instance().HasCompatiblePhiKernel(pair.first),
        true,
    PADDLE_ENFORCE_NE(
        kernels.find(pair.first),
        kernels.end(),
        phi::errors::InvalidArgument(
            "The kernel %s is not ready for custom kernel registering.",
            pair.first));

    for (auto& info_pair : pair.second) {
      auto& kernels = KernelFactory::Instance().kernels();
      PADDLE_ENFORCE_EQ(
          kernels[pair.first].find(info_pair.first),
          kernels[pair.first].end(),
......
...@@ -73,11 +73,6 @@ void DenseTensor::set_layout(const paddle::framework::DataLayout layout) { ...@@ -73,11 +73,6 @@ void DenseTensor::set_layout(const paddle::framework::DataLayout layout) {
// Note: When you reset holder, you need to ensure the offset is correct // Note: When you reset holder, you need to ensure the offset is correct
void DenseTensor::ResetHolder(const std::shared_ptr<phi::Allocation>& holder) { void DenseTensor::ResetHolder(const std::shared_ptr<phi::Allocation>& holder) {
if (holder_) { if (holder_) {
// TODO(zyfncg): The change of static_cast<> in check will recover back
// when SetAllocationForOutputTenosr is deleted.
// Now the numel() may return -1, and will cast to a very large number when
// compare with a data with unsigned long type, this will make checking
// failed, so it's a temporary solution to deal with this problem.
PADDLE_ENFORCE_LE( PADDLE_ENFORCE_LE(
numel() * static_cast<int64_t>(SizeOf(dtype())) + numel() * static_cast<int64_t>(SizeOf(dtype())) +
static_cast<int64_t>(meta_.offset), static_cast<int64_t>(meta_.offset),
......
...@@ -75,13 +75,13 @@ paddle::optional<const phi::MetaTensor&> InferMetaContext::OptionalInputAt( ...@@ -75,13 +75,13 @@ paddle::optional<const phi::MetaTensor&> InferMetaContext::OptionalInputAt(
: paddle::optional<const phi::MetaTensor&>{paddle::none}; : paddle::optional<const phi::MetaTensor&>{paddle::none};
} }
std::vector<MetaTensor> InferMetaContext::InputsBetween(size_t start,
std::vector<MetaTensor*> InferMetaContext::InputsBetween(size_t start,
                                                          size_t end) const {
  std::vector<MetaTensor> result;
  std::vector<MetaTensor*> result;
  result.reserve(end - start);

  for (size_t i = start; i < end; ++i) {
    result.emplace_back(*inputs_.at(i));
    result.push_back(inputs_.at(i).get());
  }

  return result;
...@@ -91,12 +91,12 @@ MetaTensor* InferMetaContext::MutableOutputAt(size_t idx) {
  return outputs_.at(idx).get();
}

std::vector<MetaTensor> InferMetaContext::MutableOutputBetween(size_t start,
std::vector<MetaTensor*> InferMetaContext::MutableOutputBetween(size_t start,
                                                                 size_t end) {
  std::vector<MetaTensor> result;
  std::vector<MetaTensor*> result;
  result.reserve(end - start);
  for (size_t i = start; i < end; ++i) {
    result.emplace_back(*outputs_.at(i));
    result.emplace_back(outputs_.at(i).get());
  }
  return result;
}
......
...@@ -50,13 +50,13 @@ class InferMetaContext { ...@@ -50,13 +50,13 @@ class InferMetaContext {
const std::pair<int, int>& OutputRangeAt(size_t idx) const; const std::pair<int, int>& OutputRangeAt(size_t idx) const;
const MetaConfig& GetMetaConfig() const; const MetaConfig& GetMetaConfig() const;
  const MetaTensor& InputAt(size_t idx) const;
  paddle::optional<const phi::MetaTensor&> OptionalInputAt(size_t idx) const;

  std::vector<MetaTensor> InputsBetween(size_t start, size_t end) const;
  std::vector<MetaTensor*> InputsBetween(size_t start, size_t end) const;

  MetaTensor* MutableOutputAt(size_t idx);
  std::vector<MetaTensor> MutableOutputBetween(size_t start, size_t end);
  std::vector<MetaTensor*> MutableOutputBetween(size_t start, size_t end);
template <typename AttrType> template <typename AttrType>
AttrType AttrAt(size_t idx) { AttrType AttrAt(size_t idx) {
...@@ -157,7 +157,7 @@ struct InferMetaFnImpl<Return (*)(Args...), infer_meta_fn> { ...@@ -157,7 +157,7 @@ struct InferMetaFnImpl<Return (*)(Args...), infer_meta_fn> {
}; };
template <typename... Tail> template <typename... Tail>
  struct InferMetaFnCallHelper<const std::vector<MetaTensor>&, Tail...> {
  struct InferMetaFnCallHelper<const std::vector<MetaTensor*>&, Tail...> {
template <int in_idx, int attr_idx, int out_idx, typename... PreviousArgs> template <int in_idx, int attr_idx, int out_idx, typename... PreviousArgs>
static void Call(InferMetaContext* ctx, PreviousArgs&... pargs) { static void Call(InferMetaContext* ctx, PreviousArgs&... pargs) {
static_assert(attr_idx == 0, static_assert(attr_idx == 0,
...@@ -165,7 +165,7 @@ struct InferMetaFnImpl<Return (*)(Args...), infer_meta_fn> { ...@@ -165,7 +165,7 @@ struct InferMetaFnImpl<Return (*)(Args...), infer_meta_fn> {
static_assert(out_idx == 0, static_assert(out_idx == 0,
"InferMeta's Input should appear before Outputs."); "InferMeta's Input should appear before Outputs.");
const std::pair<int, int> range = ctx->InputRangeAt(in_idx); const std::pair<int, int> range = ctx->InputRangeAt(in_idx);
      std::vector<MetaTensor> arg =
      std::vector<MetaTensor*> arg =
ctx->InputsBetween(range.first, range.second); ctx->InputsBetween(range.first, range.second);
InferMetaFnCallHelper< InferMetaFnCallHelper<
Tail...>::template Call<in_idx + 1, attr_idx, out_idx>(ctx, Tail...>::template Call<in_idx + 1, attr_idx, out_idx>(ctx,
...@@ -210,13 +210,12 @@ struct InferMetaFnImpl<Return (*)(Args...), infer_meta_fn> { ...@@ -210,13 +210,12 @@ struct InferMetaFnImpl<Return (*)(Args...), infer_meta_fn> {
}; };
template <typename... Tail> template <typename... Tail>
  struct InferMetaFnCallHelper<std::vector<MetaTensor>*, Tail...> {
  struct InferMetaFnCallHelper<std::vector<MetaTensor*>, Tail...> {
    template <int in_idx, int attr_idx, int out_idx, typename... PreviousArgs>
    static void Call(InferMetaContext* ctx, PreviousArgs&... pargs) {
      const std::pair<int, int> range = ctx->OutputRangeAt(out_idx);
      std::vector<MetaTensor> tmp =
      std::vector<MetaTensor*> arg =
          ctx->MutableOutputBetween(range.first, range.second);
      std::vector<MetaTensor>* arg = &tmp;
InferMetaFnCallHelper< InferMetaFnCallHelper<
Tail...>::template Call<in_idx, attr_idx, out_idx + 1>(ctx, Tail...>::template Call<in_idx, attr_idx, out_idx + 1>(ctx,
pargs..., pargs...,
......
...@@ -87,13 +87,11 @@ struct KernelArgsParseFunctor<Return_ (*)(Args_...)> { ...@@ -87,13 +87,11 @@ struct KernelArgsParseFunctor<Return_ (*)(Args_...)> {
default_tensor_layout, default_tensor_layout,
default_key.dtype(), default_key.dtype(),
arg_type); arg_type);
#ifndef PADDLE_WITH_CUSTOM_KERNEL
} else if (arg_type == std::type_index(typeid(const SelectedRows&))) { } else if (arg_type == std::type_index(typeid(const SelectedRows&))) {
args_def->AppendInput(default_key.backend(), args_def->AppendInput(default_key.backend(),
default_tensor_layout, default_tensor_layout,
default_key.dtype(), default_key.dtype(),
arg_type); arg_type);
#endif
} else if (arg_type == std::type_index(typeid(DenseTensor*))) { } else if (arg_type == std::type_index(typeid(DenseTensor*))) {
args_def->AppendOutput(default_key.backend(), args_def->AppendOutput(default_key.backend(),
default_tensor_layout, default_tensor_layout,
...@@ -105,13 +103,11 @@ struct KernelArgsParseFunctor<Return_ (*)(Args_...)> { ...@@ -105,13 +103,11 @@ struct KernelArgsParseFunctor<Return_ (*)(Args_...)> {
default_tensor_layout, default_tensor_layout,
default_key.dtype(), default_key.dtype(),
arg_type); arg_type);
#ifndef PADDLE_WITH_CUSTOM_KERNEL
} else if (arg_type == std::type_index(typeid(SelectedRows*))) { } else if (arg_type == std::type_index(typeid(SelectedRows*))) {
args_def->AppendOutput(default_key.backend(), args_def->AppendOutput(default_key.backend(),
default_tensor_layout, default_tensor_layout,
default_key.dtype(), default_key.dtype(),
arg_type); arg_type);
#endif
} else { } else {
// Attribute deal with // Attribute deal with
// TODO(chenweihang): now here allow any types of attribute, maybe // TODO(chenweihang): now here allow any types of attribute, maybe
......
...@@ -23,9 +23,7 @@ ...@@ -23,9 +23,7 @@
#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/kernel_context.h" #include "paddle/phi/core/kernel_context.h"
#ifndef PADDLE_WITH_CUSTOM_KERNEL
#include "paddle/phi/core/selected_rows.h" #include "paddle/phi/core/selected_rows.h"
#endif
#include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/core/sparse_coo_tensor.h"
#include "paddle/phi/core/sparse_csr_tensor.h" #include "paddle/phi/core/sparse_csr_tensor.h"
#include "paddle/phi/core/type_defs.h" #include "paddle/phi/core/type_defs.h"
...@@ -223,9 +221,7 @@ struct KernelImpl<Return (*)(DevCtx, Args...), kernel_fn> { ...@@ -223,9 +221,7 @@ struct KernelImpl<Return (*)(DevCtx, Args...), kernel_fn> {
PT_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(DenseTensor); PT_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(DenseTensor);
PT_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(SelectedRows); PT_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(SelectedRows);
PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(DenseTensor); PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(DenseTensor);
#ifndef PADDLE_WITH_CUSTOM_KERNEL
PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(SelectedRows); PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(SelectedRows);
#endif
PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(SparseCooTensor); PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(SparseCooTensor);
PT_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(SparseCooTensor); PT_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(SparseCooTensor);
...@@ -260,9 +256,7 @@ struct KernelImpl<Return (*)(DevCtx, Args...), kernel_fn> { ...@@ -260,9 +256,7 @@ struct KernelImpl<Return (*)(DevCtx, Args...), kernel_fn> {
PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(DenseTensor); PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(DenseTensor);
PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(DenseTensor); PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(DenseTensor);
#ifndef PADDLE_WITH_CUSTOM_KERNEL
PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(SelectedRows); PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(SelectedRows);
#endif
PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(SparseCooTensor); PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(SparseCooTensor);
PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(SparseCooTensor); PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(SparseCooTensor);
......
...@@ -23,13 +23,6 @@ limitations under the License. */ ...@@ -23,13 +23,6 @@ limitations under the License. */
#include "paddle/utils/any.h" #include "paddle/utils/any.h"
#include "paddle/utils/optional.h" #include "paddle/utils/optional.h"
// Note: mixed_vector include many header now, LoD will be
// used on CUDA device? Can we use small_vector here?
// @zhanlve: Rollback to original LoD for now
#ifndef PADDLE_WITH_CUSTOM_KERNEL
#include "paddle/fluid/framework/mixed_vector.h"
#endif
namespace phi { namespace phi {
using DDim = phi::DDim; using DDim = phi::DDim;
......
...@@ -225,6 +225,41 @@ void HuberLossInferMeta(const MetaTensor& input, ...@@ -225,6 +225,41 @@ void HuberLossInferMeta(const MetaTensor& input,
out->share_lod(input); out->share_lod(input);
} }
void IndexSampleInferMeta(const MetaTensor& x,
const MetaTensor& y,
MetaTensor* out,
MetaConfig config) {
auto input_dims = x.dims();
PADDLE_ENFORCE_EQ(input_dims.size(),
2,
errors::InvalidArgument(
"Inputs(X) shape of IndexSample op should be 2-D, but "
"got X's shape = [%s], please check X shape.",
input_dims));
auto index_dims = y.dims();
PADDLE_ENFORCE_EQ(
index_dims.size(),
2,
errors::InvalidArgument(
"Inputs(Index) shape of IndexSample op should be 2-D, but "
"got Index's shape [%s] , please check index shape.",
input_dims));
if (config.is_runtime) {
PADDLE_ENFORCE_EQ(input_dims[0],
index_dims[0],
errors::InvalidArgument(
"Inputs(X)'s value of dimension 0 must same with "
"Inputs(Index)'s value of dimension 0, but "
"got %d of Inputs(X), and got %d of Inputs(Index), "
"please check Inputs shape.",
input_dims[0],
index_dims[0]));
}
out->set_dtype(x.dtype());
out->set_dims(index_dims);
out->share_lod(y);
}
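Reviewer note: a minimal standalone sketch (not part of the diff) of the shape contract that IndexSampleInferMeta enforces above — X and Index must both be 2-D, their batch dimensions must match at runtime, and the output takes Index's shape with X's dtype. Plain C++, no phi dependencies assumed.

// Illustrative sketch only.
#include <cassert>
#include <cstdint>
#include <vector>

// Returns the output shape for index_sample given X's and Index's shapes.
std::vector<int64_t> IndexSampleOutShape(const std::vector<int64_t>& x_dims,
                                         const std::vector<int64_t>& index_dims,
                                         bool is_runtime) {
  assert(x_dims.size() == 2 && "X must be 2-D");
  assert(index_dims.size() == 2 && "Index must be 2-D");
  if (is_runtime) {
    assert(x_dims[0] == index_dims[0] && "batch dimensions must match");
  }
  return index_dims;  // out dims equal Index dims; dtype follows X.
}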
void CrossInferMeta(const MetaTensor& x, void CrossInferMeta(const MetaTensor& x,
const MetaTensor& y, const MetaTensor& y,
int axis, int axis,
...@@ -271,8 +306,7 @@ void CrossInferMeta(const MetaTensor& x, ...@@ -271,8 +306,7 @@ void CrossInferMeta(const MetaTensor& x,
} }
void Atan2InferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) { void Atan2InferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) {
auto in_dims = x.dims(); out->share_meta(x);
out->set_dims(in_dims);
} }
void BCELossInferMeta(const MetaTensor& input, void BCELossInferMeta(const MetaTensor& input,
......
...@@ -53,6 +53,11 @@ void HuberLossInferMeta(const MetaTensor& input_meta, ...@@ -53,6 +53,11 @@ void HuberLossInferMeta(const MetaTensor& input_meta,
MetaTensor* residual, MetaTensor* residual,
MetaConfig config = MetaConfig()); MetaConfig config = MetaConfig());
void IndexSampleInferMeta(const MetaTensor& x,
const MetaTensor& y,
MetaTensor* out,
MetaConfig config = MetaConfig());
void CrossInferMeta(const MetaTensor& x, void CrossInferMeta(const MetaTensor& x,
const MetaTensor& y, const MetaTensor& y,
int axis, int axis,
......
...@@ -84,7 +84,7 @@ void BilinearTensorProductInferMeta(const MetaTensor& x, ...@@ -84,7 +84,7 @@ void BilinearTensorProductInferMeta(const MetaTensor& x,
out->set_dtype(x.dtype()); out->set_dtype(x.dtype());
} }
void ConcatInferMeta(const std::vector<MetaTensor>& x, void ConcatInferMeta(const std::vector<MetaTensor*>& x,
const Scalar& axis_scalar, const Scalar& axis_scalar,
MetaTensor* out, MetaTensor* out,
MetaConfig config) { MetaConfig config) {
...@@ -93,10 +93,19 @@ void ConcatInferMeta(const std::vector<MetaTensor>& x, ...@@ -93,10 +93,19 @@ void ConcatInferMeta(const std::vector<MetaTensor>& x,
phi::errors::InvalidArgument( phi::errors::InvalidArgument(
"The size of input meta vector should be greater" "The size of input meta vector should be greater"
"than 0.")); "than 0."));
if (axis_scalar.FromTensor()) {
auto out_dims =
phi::make_ddim(std::vector<int>(x.at(0)->dims().size(), -1));
out->set_dims(out_dims);
out->set_dtype(x.at(0)->dtype());
out->set_layout(x.at(0)->layout());
out->share_lod(*x.at(0));
return;
}
int axis = axis_scalar.to<int>(); int axis = axis_scalar.to<int>();
// 1. calculate axis // 1. calculate axis
int rank = x.at(0).dims().size(); int rank = x.at(0)->dims().size();
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
axis >= -rank && axis < rank, axis >= -rank && axis < rank,
true, true,
...@@ -111,15 +120,42 @@ void ConcatInferMeta(const std::vector<MetaTensor>& x, ...@@ -111,15 +120,42 @@ void ConcatInferMeta(const std::vector<MetaTensor>& x,
// 2. calculate out dims // 2. calculate out dims
std::vector<phi::DDim> x_dims; std::vector<phi::DDim> x_dims;
for (auto& x_t : x) { x_dims.reserve(x.size());
x_dims.push_back(x_t.dims()); for (const auto* x_t : x) {
x_dims.emplace_back(x_t->dims());
} }
phi::DDim out_dim = phi::DDim out_dim =
phi::funcs::ComputeAndCheckShape(config.is_runtime, x_dims, axis); phi::funcs::ComputeAndCheckShape(config.is_runtime, x_dims, axis);
out->set_dims(out_dim); out->set_dims(out_dim);
out->set_dtype(x.at(0).dtype()); out->set_dtype(x.at(0)->dtype());
out->set_layout(x.at(0).layout()); out->set_layout(x.at(0)->layout());
out->share_lod(*x.at(0));
}
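Reviewer note: with the switch to std::vector<MetaTensor*>, callers now pass pointers, and the early-return branch above leaves every output dimension as -1 whenever the axis still lives in a tensor. For reference, a self-contained sketch of the underlying output-dim computation (illustrative only, not the phi implementation):

#include <cassert>
#include <cstdint>
#include <vector>

// Concatenate shapes along `axis`; all non-axis dims must agree.
std::vector<int64_t> ConcatDims(const std::vector<std::vector<int64_t>>& x_dims,
                                int axis) {
  assert(!x_dims.empty());
  std::vector<int64_t> out = x_dims[0];
  for (size_t i = 1; i < x_dims.size(); ++i) {
    for (size_t d = 0; d < out.size(); ++d) {
      if (static_cast<int>(d) == axis) {
        out[d] += x_dims[i][d];          // axis dim accumulates
      } else {
        assert(out[d] == x_dims[i][d]);  // other dims must match
      }
    }
  }
  return out;
}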
void WhereInferMeta(const MetaTensor& condition,
const MetaTensor& x,
const MetaTensor& y,
MetaTensor* out) {
auto cond_dims = condition.dims();
auto x_dims = x.dims();
auto y_dims = y.dims();
PADDLE_ENFORCE_EQ(
cond_dims,
x_dims,
phi::errors::InvalidArgument(
"The dims of Inputs(Condition) and Inputs(X) should be same. "
"But received Condition's shape is [%s], X's shape is [%s]",
cond_dims,
x_dims));
PADDLE_ENFORCE_EQ(x_dims,
y_dims,
phi::errors::InvalidArgument(
"The dims of Inputs(X) and Inputs(Y) should be same. "
"But received X's shape is [%s], Y's shape is [%s]",
x_dims,
y_dims));
out->share_meta(x);
} }
} // namespace phi } // namespace phi
...@@ -25,9 +25,13 @@ void BilinearTensorProductInferMeta(const MetaTensor& x, ...@@ -25,9 +25,13 @@ void BilinearTensorProductInferMeta(const MetaTensor& x,
MetaTensor* out, MetaTensor* out,
MetaConfig config = MetaConfig()); MetaConfig config = MetaConfig());
void ConcatInferMeta(const std::vector<MetaTensor>& x, void ConcatInferMeta(const std::vector<MetaTensor*>& x,
const Scalar& axis_scalar, const Scalar& axis_scalar,
MetaTensor* out, MetaTensor* out,
MetaConfig config = MetaConfig()); MetaConfig config = MetaConfig());
void WhereInferMeta(const MetaTensor& condition,
const MetaTensor& x,
const MetaTensor& y,
MetaTensor* out);
} // namespace phi } // namespace phi
...@@ -459,8 +459,19 @@ void TransferLayoutInferMeta(const MetaTensor& x, ...@@ -459,8 +459,19 @@ void TransferLayoutInferMeta(const MetaTensor& x,
void SplitInferMeta(const MetaTensor& x, void SplitInferMeta(const MetaTensor& x,
const ScalarArray& num_or_sections, const ScalarArray& num_or_sections,
const Scalar& axis, const Scalar& axis,
std::vector<MetaTensor>* out, std::vector<MetaTensor*> out,
MetaConfig config) { MetaConfig config) {
if (!config.is_runtime) {
if (axis.FromTensor() || num_or_sections.FromTensor()) {
auto out_dims = phi::make_ddim(std::vector<int>(x.dims().size(), -1));
for (auto* item : out) {
item->set_dims(out_dims);
item->share_lod(x);
}
return;
}
}
int axis_value = axis.to<int>(); int axis_value = axis.to<int>();
int rank = x.dims().size(); int rank = x.dims().size();
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
...@@ -475,27 +486,34 @@ void SplitInferMeta(const MetaTensor& x, ...@@ -475,27 +486,34 @@ void SplitInferMeta(const MetaTensor& x,
axis_value = axis_value + rank; axis_value = axis_value + rank;
} }
std::vector<phi::DDim> out_dims(out.size(), x.dims());
auto input_axis_dim = x.dims().at(axis_value); auto input_axis_dim = x.dims().at(axis_value);
auto num_or_sections_data = num_or_sections.GetData(); auto num_or_sections_data = num_or_sections.GetData();
// step1: get formated sections
std::vector<int64_t> sections;
// num_or_sections is a number // num_or_sections is a number
if (num_or_sections_data.size() == 1) { if (num_or_sections_data.size() == 1) {
int num = num_or_sections_data.at(0); if (config.is_runtime || input_axis_dim > 0) {
int num = num_or_sections_data.at(0);
PADDLE_ENFORCE_EQ(
input_axis_dim % num,
0,
phi::errors::InvalidArgument(
"The input's size along the split dimension "
"must be evenly divisible by Attr(num_or_sections). "
"But received Attr(num_or_sections) "
"= %d, input(X)'s shape = [%s], Attr(dim) = %d.",
num,
x.dims(),
axis_value));
PADDLE_ENFORCE_EQ(input_axis_dim % num, size_t out_axis_dim = input_axis_dim / num;
0, for (auto& out_dim : out_dims) {
phi::errors::InvalidArgument( out_dim[axis_value] = out_axis_dim;
"The input's size along the split dimension " }
"must be evenly divisible by Attr(num_or_sections). " } else {
"But received Attr(num_or_sections) " for (auto& out_dim : out_dims) {
"= %d, input(X)'s shape = [%s], Attr(dim) = %d.", out_dim[axis_value] = -1;
num, }
x.dims(),
axis_value));
for (int i = 0; i < num; ++i) {
sections.push_back(input_axis_dim / num);
} }
} else { } else {
// num_or_sections is a sections // num_or_sections is a sections
...@@ -503,10 +521,9 @@ void SplitInferMeta(const MetaTensor& x, ...@@ -503,10 +521,9 @@ void SplitInferMeta(const MetaTensor& x,
int unknow_dim_idx = -1; int unknow_dim_idx = -1;
int num_of_unknow = 0; int num_of_unknow = 0;
int sum_of_section = 0; int sum_of_section = 0;
std::vector<int64_t> sections = num_or_sections_data;
for (size_t i = 0; i < num_or_sections_data.size(); ++i) { for (size_t i = 0; i < num_or_sections_data.size(); ++i) {
sections.push_back(num_or_sections_data[i]);
if (num_or_sections_data[i] == unknow_dim_val) { if (num_or_sections_data[i] == unknow_dim_val) {
num_of_unknow++; num_of_unknow++;
unknow_dim_idx = i; unknow_dim_idx = i;
...@@ -558,31 +575,22 @@ void SplitInferMeta(const MetaTensor& x, ...@@ -558,31 +575,22 @@ void SplitInferMeta(const MetaTensor& x,
x.dims(), x.dims(),
axis_value)); axis_value));
} }
} for (size_t i = 0; i < out_dims.size(); ++i) {
// setp2: fill out dims
std::vector<phi::DDim> out_dims(sections.size(), x.dims());
if (config.is_runtime || input_axis_dim > 0) {
for (size_t i = 0; i < sections.size(); ++i) {
out_dims[i][axis_value] = sections[i]; out_dims[i][axis_value] = sections[i];
} }
} else {
for (size_t i = 0; i < sections.size(); ++i) {
out_dims[i][axis_value] = -1;
}
} }
for (size_t i = 0; i < sections.size(); ++i) { for (size_t i = 0; i < out.size(); ++i) {
if (axis_value != 0) { if (axis_value != 0) {
// Only pass LoD when not splitting along the first dim. // Only pass LoD when not splitting along the first dim.
(*out)[i].set_dtype(x.dtype()); out.at(i)->set_dtype(x.dtype());
(*out)[i].set_dims(out_dims[i]); out.at(i)->set_dims(out_dims[i]);
(*out)[i].set_layout(x.layout()); out.at(i)->set_layout(x.layout());
} else { } else {
(*out)[i].set_dtype(x.dtype()); out.at(i)->set_dtype(x.dtype());
(*out)[i].set_dims(out_dims[i]); out.at(i)->set_dims(out_dims[i]);
(*out)[i].set_layout(x.layout()); out.at(i)->set_layout(x.layout());
(*out)[i].share_lod(x); out.at(i)->share_lod(x);
} }
} }
} }
......
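Reviewer note: a standalone sketch of how the refactored SplitInferMeta derives the per-output sections — an even split when num_or_sections holds a single number, otherwise the given sections with at most one -1 entry inferred from the remainder. This mirrors the logic above under simplified assumptions (runtime shapes known, no MetaTensor involved); illustrative only.

#include <cassert>
#include <cstdint>
#include <vector>

std::vector<int64_t> SplitSections(int64_t axis_dim,
                                   const std::vector<int64_t>& num_or_sections) {
  if (num_or_sections.size() == 1) {        // split into `num` equal pieces
    int64_t num = num_or_sections[0];
    assert(axis_dim % num == 0);
    return std::vector<int64_t>(num, axis_dim / num);
  }
  std::vector<int64_t> sections = num_or_sections;  // explicit sections
  int64_t known_sum = 0, unknown_idx = -1;
  for (size_t i = 0; i < sections.size(); ++i) {
    if (sections[i] == -1) {
      assert(unknown_idx == -1 && "at most one -1 section");
      unknown_idx = static_cast<int64_t>(i);
    } else {
      known_sum += sections[i];
    }
  }
  assert(known_sum <= axis_dim);
  if (unknown_idx >= 0) sections[unknown_idx] = axis_dim - known_sum;
  return sections;
}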
...@@ -107,7 +107,7 @@ void TransferLayoutInferMeta(const MetaTensor& x, ...@@ -107,7 +107,7 @@ void TransferLayoutInferMeta(const MetaTensor& x,
void SplitInferMeta(const MetaTensor& x_meta, void SplitInferMeta(const MetaTensor& x_meta,
const ScalarArray& num_or_sections, const ScalarArray& num_or_sections,
const Scalar& axis, const Scalar& axis,
std::vector<MetaTensor>* out, std::vector<MetaTensor*> out,
MetaConfig config = MetaConfig()); MetaConfig config = MetaConfig());
void UnbindInferMeta(const MetaTensor& x, void UnbindInferMeta(const MetaTensor& x,
......
...@@ -31,13 +31,16 @@ DenseTensor Concat(const Context& dev_ctx, ...@@ -31,13 +31,16 @@ DenseTensor Concat(const Context& dev_ctx,
const std::vector<DenseTensor>& x, const std::vector<DenseTensor>& x,
const Scalar& axis) { const Scalar& axis) {
std::vector<MetaTensor> meta_x; std::vector<MetaTensor> meta_x;
meta_x.reserve(x.size());
std::vector<MetaTensor*> meta_x_ptr;
for (const auto& t : x) { for (const auto& t : x) {
meta_x.emplace_back(t); meta_x.emplace_back(t);
meta_x_ptr.push_back(&meta_x.back());
} }
auto dense_out = phi::Empty<T, Context>(dev_ctx); auto dense_out = phi::Empty<T, Context>(dev_ctx);
MetaTensor meta_out(&dense_out); MetaTensor meta_out(&dense_out);
ConcatInferMeta(meta_x, axis.to<int>(), &meta_out, /*is_runtime=*/true); ConcatInferMeta(meta_x_ptr, axis.to<int>(), &meta_out, /*is_runtime=*/true);
ConcatKernel<T, Context>(dev_ctx, x, axis, &dense_out); ConcatKernel<T, Context>(dev_ctx, x, axis, &dense_out);
return dense_out; return dense_out;
} }
......
...@@ -12,11 +12,10 @@ ...@@ -12,11 +12,10 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/phi/kernels/atan2_grad_kernel.h" #include "paddle/phi/kernels/impl/atan2_grad_kernel_impl.h"
#include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/device_context.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/atan2_grad_kernel_impl.h"
PD_REGISTER_KERNEL(atan2_grad, PD_REGISTER_KERNEL(atan2_grad,
CPU, CPU,
......
...@@ -12,11 +12,10 @@ ...@@ -12,11 +12,10 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/phi/kernels/atan2_kernel.h" #include "paddle/phi/kernels/impl/atan2_kernel_impl.h"
#include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/device_context.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/atan2_kernel_impl.h"
PD_REGISTER_KERNEL(atan2, PD_REGISTER_KERNEL(atan2,
CPU, CPU,
......
...@@ -37,6 +37,7 @@ void ConcatKernel(const Context& dev_ctx, ...@@ -37,6 +37,7 @@ void ConcatKernel(const Context& dev_ctx,
axis = phi::funcs::ComputeAxis(axis, x[0].dims().size()); axis = phi::funcs::ComputeAxis(axis, x[0].dims().size());
std::vector<phi::DDim> x_dims; std::vector<phi::DDim> x_dims;
x_dims.reserve(x.size());
for (size_t i = 0; i < x.size(); ++i) { for (size_t i = 0; i < x.size(); ++i) {
x_dims.push_back(x[i].dims()); x_dims.push_back(x[i].dims());
} }
...@@ -97,9 +98,10 @@ void ConcatKernel(const Context& dev_ctx, ...@@ -97,9 +98,10 @@ void ConcatKernel(const Context& dev_ctx,
} }
} else { } else {
std::vector<phi::DenseTensor> inputs; std::vector<phi::DenseTensor> inputs;
inputs.reserve(x.size());
for (size_t j = 0; j < x.size(); ++j) { for (size_t j = 0; j < x.size(); ++j) {
if (x[j].numel() > 0) { if (x[j].numel() > 0) {
inputs.push_back(x[j]); inputs.emplace_back(x[j]);
} else { } else {
continue; continue;
} }
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/index_sample_grad_kernel.h"
#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
template <typename T, typename Context, typename IndexT = int>
void IndexSampleGradInner(const Context& context,
const DenseTensor& out_grad,
const DenseTensor& index,
DenseTensor* x_grad) {
std::vector<T> out_grad_vec;
std::vector<IndexT> index_vec;
paddle::framework::TensorToVector(out_grad, context, &out_grad_vec);
paddle::framework::TensorToVector(index, context, &index_vec);
auto index_dims = index.dims();
auto x_grad_dims = x_grad->dims();
auto value_length = x_grad_dims[1];
auto index_length = index_dims[1];
int index_ids_num = index.numel();
std::vector<T> x_grad_vec(x_grad->numel(), 0);
for (int i = 0; i < index_ids_num; i++) {
int b = floor(i / index_length);
PADDLE_ENFORCE_GE(
index_vec[i],
0,
errors::InvalidArgument(
"Variable value (index) of OP(index_sample_grad) "
"expected >= 0 and < %ld, but got %ld. Please check input "
"value.",
value_length,
index_vec[i]));
PADDLE_ENFORCE_LT(
index_vec[i],
value_length,
errors::InvalidArgument(
"Variable value (index) of OP(index_sample_grad) "
"expected >= 0 and < %ld, but got %ld. Please check input "
"value.",
value_length,
index_vec[i]));
int v_i = b * value_length + static_cast<int>(index_vec[i]);
x_grad_vec[v_i] += out_grad_vec[i];
}
context.template Alloc<T>(x_grad);
paddle::framework::TensorFromVector(x_grad_vec, context, x_grad);
x_grad->Resize(x_grad_dims);
}
template <typename T, typename Context>
void IndexSampleGradKernel(const Context& ctx,
const DenseTensor& out_grad,
const DenseTensor& x,
const DenseTensor& index,
DenseTensor* x_grad) {
auto index_type = index.dtype();
bool index_type_match =
index_type == DataType::INT32 || index_type == DataType::INT64;
PADDLE_ENFORCE_EQ(
index_type_match,
true,
errors::InvalidArgument(
"Input(Index) holds the wrong type, it holds %s, but "
"desires to be %s or %s",
paddle::framework::DataTypeToString(
paddle::framework::TransToProtoVarType(index_type)),
paddle::framework::DataTypeToString(
paddle::framework::TransToProtoVarType(DataType::INT32)),
paddle::framework::DataTypeToString(
paddle::framework::TransToProtoVarType((DataType::INT64)))));
if (index_type == DataType::INT32) {
IndexSampleGradInner<T, Context, int>(ctx, out_grad, index, x_grad);
} else if (index_type == DataType::INT64) {
IndexSampleGradInner<T, Context, int64_t>(ctx, out_grad, index, x_grad);
}
}
} // namespace phi
PD_REGISTER_KERNEL(index_sample_grad,
CPU,
ALL_LAYOUT,
phi::IndexSampleGradKernel,
float,
double,
int,
int64_t) {}
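Reviewer note: the gradient above is a per-row scatter-add — each output-gradient element is accumulated into the input-gradient column selected by the index. A tiny standalone reference (row-major buffers, illustrative only):

#include <cstdint>
#include <vector>

// x_grad has shape [batch, value_length]; index/out_grad have [batch, index_length].
void IndexSampleGradRef(const std::vector<float>& out_grad,
                        const std::vector<int64_t>& index,
                        int64_t batch, int64_t value_length, int64_t index_length,
                        std::vector<float>* x_grad) {
  x_grad->assign(batch * value_length, 0.f);
  for (int64_t b = 0; b < batch; ++b) {
    for (int64_t j = 0; j < index_length; ++j) {
      int64_t col = index[b * index_length + j];
      (*x_grad)[b * value_length + col] += out_grad[b * index_length + j];
    }
  }
}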
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/index_sample_kernel.h"
#include <cmath>
#include <fstream>
#include <set>
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
template <typename T, typename Context, typename IndexT = int>
void IndexSampleInner(const Context &context,
const DenseTensor &input,
const DenseTensor &index,
DenseTensor *output) {
auto input_dims = input.dims();
auto index_dims = index.dims();
int batch_size = input_dims[0];
auto value_length = input_dims[1];
auto index_length = index_dims[1];
int index_ids_num = index.numel();
std::vector<T> input_vec;
std::vector<IndexT> index_vec;
paddle::framework::TensorToVector(input, context, &input_vec);
paddle::framework::TensorToVector(index, context, &index_vec);
std::vector<T> res(index_ids_num);
for (int i = 0; i < index_ids_num; i++) {
int b = floor(i / index_length);
PADDLE_ENFORCE_GE(
index_vec[i],
0,
errors::InvalidArgument(
"Variable value (index) of OP(index_sample) "
"expected >= 0 and < %ld, but got %ld. Please check input "
"value.",
value_length,
index_vec[i]));
PADDLE_ENFORCE_LT(
index_vec[i],
value_length,
errors::InvalidArgument(
"Variable value (index) of OP(index_sample) "
"expected >= 0 and < %ld, but got %ld. Please check input "
"value.",
value_length,
index_vec[i]));
int v_i = b * value_length + static_cast<int>(index_vec[i]);
T v = input_vec[v_i];
VLOG(4) << "Index Sample: batch = " << b << " index = " << v_i
<< " value = " << v;
res[i] = v;
}
auto ddim = phi::make_ddim({batch_size, index_length});
context.template Alloc<T>(output);
paddle::framework::TensorFromVector(res, context, output);
output->Resize(ddim);
}
template <typename T, typename Context>
void IndexSampleKernel(const Context &ctx,
const DenseTensor &x,
const DenseTensor &index,
DenseTensor *out) {
ctx.template Alloc<T>(out);
auto index_type = index.dtype();
bool index_type_match =
index_type == DataType::INT32 || index_type == DataType::INT64;
PADDLE_ENFORCE_EQ(
index_type_match,
true,
errors::InvalidArgument(
"Input(Index) holds the wrong type, it holds %s, but "
"desires to be %s or %s",
paddle::framework::DataTypeToString(
paddle::framework::TransToProtoVarType(index_type)),
paddle::framework::DataTypeToString(
paddle::framework::TransToProtoVarType(DataType::INT32)),
paddle::framework::DataTypeToString(
paddle::framework::TransToProtoVarType((DataType::INT64)))));
if (index_type == DataType::INT32) {
IndexSampleInner<T, Context, int>(ctx, x, index, out);
} else if (index_type == DataType::INT64) {
IndexSampleInner<T, Context, int64_t>(ctx, x, index, out);
}
}
} // namespace phi
PD_REGISTER_KERNEL(index_sample,
CPU,
ALL_LAYOUT,
phi::IndexSampleKernel,
float,
double,
int,
int64_t) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/logical_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/cpu/elementwise.h"
#include "paddle/phi/kernels/funcs/logical_functor.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/platform/transform.h"
namespace phi {
#define DEFINE_LOGICAL_BINARY_KERNEL(type) \
template <typename T, typename Context> \
void Logical##type##Kernel(const Context& dev_ctx, \
const DenseTensor& x, \
const DenseTensor& y, \
DenseTensor* out) { \
funcs::Logical##type##Functor<T> binary_func; \
ElementwiseCompute<funcs::Logical##type##Functor<T>, T, bool>( \
dev_ctx, x, y, -1, binary_func, out); \
}
DEFINE_LOGICAL_BINARY_KERNEL(And)
DEFINE_LOGICAL_BINARY_KERNEL(Or)
DEFINE_LOGICAL_BINARY_KERNEL(Xor)
#undef DEFINE_LOGICAL_BINARY_KERNEL
template <typename T, typename Context>
void LogicalNotKernel(const Context& dev_ctx,
const DenseTensor& x,
DenseTensor* out) {
auto* out_ptr = dev_ctx.template Alloc<bool>(out);
funcs::LogicalNotFunctor<T> unary_func;
paddle::platform::Transform<Context> trans;
trans(dev_ctx, x.data<T>(), x.data<T>() + x.numel(), out_ptr, unary_func);
}
} // namespace phi
#define REGISTER_LOGICAL_CPU_KERNEL(logical_and, func_type) \
PD_REGISTER_KERNEL(logical_and, \
CPU, \
ALL_LAYOUT, \
phi::Logical##func_type##Kernel, \
float, \
double, \
bool, \
int64_t, \
int, \
int8_t, \
int16_t) {}
REGISTER_LOGICAL_CPU_KERNEL(logical_and, And)
REGISTER_LOGICAL_CPU_KERNEL(logical_or, Or)
REGISTER_LOGICAL_CPU_KERNEL(logical_not, Not)
REGISTER_LOGICAL_CPU_KERNEL(logical_xor, Xor)
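Reviewer note: the first macro parameter (spelled logical_and in the definition) is only a placeholder for the registration name; each invocation supplies its own. For instance, REGISTER_LOGICAL_CPU_KERNEL(logical_or, Or) expands roughly to the block below (a sketch of the macro expansion, not additional code in this diff):

PD_REGISTER_KERNEL(logical_or,
                   CPU,
                   ALL_LAYOUT,
                   phi::LogicalOrKernel,
                   float,
                   double,
                   bool,
                   int64_t,
                   int,
                   int8_t,
                   int16_t) {}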
...@@ -28,20 +28,6 @@ void SplitKernel(const Context& dev_ctx, ...@@ -28,20 +28,6 @@ void SplitKernel(const Context& dev_ctx,
const ScalarArray& num_or_sections, const ScalarArray& num_or_sections,
const Scalar& axis_scalar, const Scalar& axis_scalar,
std::vector<DenseTensor*> outs) { std::vector<DenseTensor*> outs) {
// need to infershape output
if (num_or_sections.FromTensor() || axis_scalar.FromTensor()) {
std::vector<MetaTensor> out_metas;
for (size_t i = 0; i < outs.size(); ++i) {
out_metas.push_back(outs[i]);
}
phi::SplitInferMeta(x, num_or_sections, axis_scalar, &out_metas, true);
for (size_t i = 0; i < out_metas.size(); ++i) {
outs[i]->Resize(out_metas[i].dims());
}
}
std::vector<const DenseTensor*> shape_refer; std::vector<const DenseTensor*> shape_refer;
for (size_t j = 0; j < outs.size(); ++j) { for (size_t j = 0; j < outs.size(); ++j) {
dev_ctx.template Alloc<T>(outs[j]); dev_ctx.template Alloc<T>(outs[j]);
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/uniform_random_kernel.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
template <typename T>
inline void UniformRealDistribution(T *data,
const int64_t &size,
const float &min,
const float &max,
std::shared_ptr<std::mt19937_64> engine) {
std::uniform_real_distribution<T> dist(static_cast<T>(min),
static_cast<T>(max));
for (int64_t i = 0; i < size; ++i) {
data[i] = dist(*engine);
}
}
template <>
inline void UniformRealDistribution(phi::dtype::bfloat16 *data,
const int64_t &size,
const float &min,
const float &max,
std::shared_ptr<std::mt19937_64> engine) {
std::uniform_real_distribution<float> dist(min, max);
for (int64_t i = 0; i < size; ++i) {
data[i] = static_cast<phi::dtype::bfloat16>(dist(*engine));
}
}
template <typename T, typename Context>
void UniformRandomRawKernel(const Context &dev_ctx,
const ScalarArray &shape,
DataType dtype,
float min,
float max,
int seed,
int diag_num,
int diag_step,
float diag_val,
DenseTensor *out) {
out->Resize(phi::make_ddim(shape.GetData()));
VLOG(4) << out->dims();
T *data = dev_ctx.template Alloc<T>(out);
auto size = out->numel();
std::shared_ptr<std::mt19937_64> engine;
if (seed) {
engine = std::make_shared<std::mt19937_64>();
engine->seed(seed);
} else {
engine = dev_ctx.GetGenerator()->GetCPUEngine();
}
UniformRealDistribution<T>(data, size, min, max, engine);
if (diag_num > 0) {
PADDLE_ENFORCE_GT(
size,
(diag_num - 1) * (diag_step + 1),
phi::errors::InvalidArgument(
"ShapeInvalid: the diagonal's elements is equal (num-1) "
"* (step-1) with num %d, step %d,"
"It should be smaller than %d, but received %d",
diag_num,
diag_step,
(diag_num - 1) * (diag_step + 1),
size));
for (int64_t i = 0; i < diag_num; ++i) {
int64_t pos = i * diag_step + i;
data[pos] = diag_val;
}
}
}
template <typename T, typename Context>
void UniformRandomKernel(const Context &dev_ctx,
const ScalarArray &shape,
DataType dtype,
float min,
float max,
int seed,
DenseTensor *out) {
UniformRandomRawKernel<T>(
dev_ctx, shape, dtype, min, max, seed, 0, 0, 0.0f, out);
}
} // namespace phi
PD_REGISTER_KERNEL(uniform_random_raw,
CPU,
ALL_LAYOUT,
phi::UniformRandomRawKernel,
float,
double,
phi::dtype::bfloat16) {}
PD_REGISTER_KERNEL(uniform_random,
CPU,
ALL_LAYOUT,
phi::UniformRandomKernel,
float,
double,
phi::dtype::bfloat16) {}
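Reviewer note: the diagonal fill above writes diag_val at flattened positions i * (diag_step + 1) for i in [0, diag_num). A short standalone sketch of that indexing (illustrative only):

#include <cstdint>
#include <vector>

// Fill `diag_num` positions spaced (diag_step + 1) apart with diag_val.
void FillDiag(std::vector<float>* data, int64_t diag_num, int64_t diag_step,
              float diag_val) {
  for (int64_t i = 0; i < diag_num; ++i) {
    int64_t pos = i * diag_step + i;  // == i * (diag_step + 1)
    if (pos < static_cast<int64_t>(data->size())) (*data)[pos] = diag_val;
  }
}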
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/where_grad_kernel.h"
namespace phi {
template <typename T, typename Context>
void WhereGradKernel(const Context& ctx,
const DenseTensor& condition,
const DenseTensor& x,
const DenseTensor& y,
const DenseTensor& out_grad,
DenseTensor* x_grad,
DenseTensor* y_grad) {
const auto* cond_data = condition.data<bool>();
auto numel = condition.numel();
auto* dout = out_grad.data<T>();
if (x_grad != nullptr) {
auto* dx = ctx.template Alloc<T>(x_grad);
for (int i = 0; i < numel; i++) {
dx[i] = dout[i] * (cond_data[i] ? 1. : 0.);
}
}
if (y_grad != nullptr) {
auto* dy = ctx.template Alloc<T>(y_grad);
for (int i = 0; i < numel; i++) {
dy[i] = dout[i] * (cond_data[i] ? 0. : 1.);
}
}
}
} // namespace phi
PD_REGISTER_KERNEL(where_grad,
CPU,
ALL_LAYOUT,
phi::WhereGradKernel,
float,
double,
int,
int64_t) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/where_kernel.h"
namespace phi {
template <typename T, typename Context>
void WhereKernel(const Context& ctx,
const DenseTensor& condition,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* out) {
const bool* cond_data = condition.data<bool>();
const T* x_data = x.data<T>();
const T* y_data = y.data<T>();
auto x_numel = x.numel();
T* out_data = ctx.template Alloc<T>(out);
for (int i = 0; i < x_numel; i++) {
out_data[i] = cond_data[i] ? x_data[i] : y_data[i];
}
}
} // namespace phi
PD_REGISTER_KERNEL(
where, CPU, ALL_LAYOUT, phi::WhereKernel, float, double, int, int64_t) {}
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/phi/core/hostdevice.h"
namespace phi {
// Aligned vector generates vectorized load/store on CUDA.
template <typename T, int Size>
struct alignas(sizeof(T) * Size) AlignedVector {
T val[Size];
HOSTDEVICE inline const T& operator[](int i) const { return val[i]; }
HOSTDEVICE inline T& operator[](int i) { return val[i]; }
};
template <typename T, int Size>
HOSTDEVICE inline void Load(const T* addr, AlignedVector<T, Size>* vec) {
const AlignedVector<T, Size>* addr_vec =
reinterpret_cast<const AlignedVector<T, Size>*>(addr);
*vec = *addr_vec;
}
template <typename T, int Size>
HOSTDEVICE inline void Store(const AlignedVector<T, Size>& vec, T* addr) {
AlignedVector<T, Size>* addr_vec =
reinterpret_cast<AlignedVector<T, Size>*>(addr);
*addr_vec = vec;
}
/*
 * Vectorized loads are only possible when the address of the input data is
 * aligned to a multiple of 1, 2, or 4 elements. Moreover, a single vectorized
 * load moves at most 128 bits. The valid vectorization width is therefore
 * determined under both constraints.
*/
template <typename T>
int GetVectorizedSize(const T* pointer) {
constexpr int max_load_bits = 128;
int valid_vec_size = max_load_bits / CHAR_BIT / sizeof(T);
uint64_t address = reinterpret_cast<uint64_t>(pointer);
constexpr int vec8 = std::alignment_of<AlignedVector<T, 8>>::value; // NOLINT
constexpr int vec4 = std::alignment_of<AlignedVector<T, 4>>::value; // NOLINT
constexpr int vec2 = std::alignment_of<AlignedVector<T, 2>>::value; // NOLINT
if (address % vec8 == 0) {
/*
 * Currently we vectorize at most 4 elements per load/store. If performance
 * tests show that handling 8 elements at a time is faster, the return
 * statement below can be changed to "return std::min(8, valid_vec_size);".
*/
return std::min(4, valid_vec_size);
} else if (address % vec4 == 0) {
return std::min(4, valid_vec_size);
} else if (address % vec2 == 0) {
return std::min(2, valid_vec_size);
} else {
return 1;
}
}
} // namespace phi
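Reviewer note: a standalone sketch of the alignment decision made by GetVectorizedSize above (simplified: the vec8 branch collapses into the vec4 result, exactly as in the original). Plain C++, no phi headers; Vec and VectorWidth are illustrative names, not part of the commit.

#include <cstdint>
#include <type_traits>

// Pick the widest vector (up to 128 bits) whose alignment the pointer satisfies.
template <typename T, int Size>
struct alignas(sizeof(T) * Size) Vec { T val[Size]; };

template <typename T>
int VectorWidth(const T* p) {
  constexpr int max_elems = 128 / 8 / static_cast<int>(sizeof(T));
  auto addr = reinterpret_cast<std::uintptr_t>(p);
  if (addr % alignof(Vec<T, 4>) == 0) return max_elems < 4 ? max_elems : 4;
  if (addr % alignof(Vec<T, 2>) == 0) return max_elems < 2 ? max_elems : 2;
  return 1;
}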
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef __NVCC__
#include <curand_kernel.h>
#endif
#ifdef __HIPCC__
#include <hiprand_kernel.h>
#endif
#include "paddle/phi/backends/gpu/gpu_info.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/device_context.h"
#include "paddle/phi/core/generator.h"
#include "paddle/phi/kernels/funcs/index_impl.cu.h"
#if defined(__NVCC__) || defined(__HIPCC__)
#include "paddle/phi/kernels/primitive/kernel_primitives.h"
#endif
#if !defined(_WIN32)
#define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
#else
// There is no equivalent intrinsic in MSVC.
#define UNLIKELY(condition) (condition)
#endif
namespace phi {
namespace distribution {
/********************* Transformation Function **********************/
template <typename T>
struct exponential_transform {
explicit exponential_transform(T lambda) : lambda_(lambda) {}
HOSTDEVICE inline T operator()(T val) const {
#if defined(__NVCC__) || defined(__HIPCC__)
if (std::is_same<T, double>::value) {
return static_cast<T>(-1.0) / lambda_ * log(val);
} else {
return static_cast<T>(-1.0) / lambda_ * __logf(val);
}
#else
return static_cast<T>(-1.0) / lambda_ * std::log(static_cast<T>(1.0) - val);
#endif
}
private:
T lambda_;
};
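// Editorial note (not part of the original header): exponential_transform is
// inverse-CDF sampling. With F(x) = 1 - exp(-lambda * x), the inverse is
//   x = -log(1 - u) / lambda,  u ~ Uniform[0, 1),
// which the CPU branch uses directly. The GPU branch uses -log(u) / lambda,
// presumably because curand_uniform*/hiprand_uniform* exclude 0, so log(0)
// is already avoided without the 1 - u shift.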
template <typename T>
struct uniform_transform {
explicit uniform_transform(T min, T max) : range_(max - min), min_(min) {}
HOSTDEVICE inline T operator()(T val) const {
if (UNLIKELY(val == static_cast<T>(1.0))) {
return min_;
} else {
return val * range_ + min_;
}
}
private:
T range_;
T min_;
};
template <typename T>
struct normal_transform {
explicit normal_transform(T mean, T std) : mean_(mean), std_(std) {}
HOSTDEVICE inline T operator()(T val) const { return val * std_ + mean_; }
private:
T mean_;
T std_;
};
#if defined(__NVCC__) || defined(__HIPCC__)
namespace kps = phi::kps;
/*********************** Distribution Function *************************/
template <typename T>
struct uniform_distribution;
template <typename T>
struct normal_distribution;
#if defined(__NVCC__)
template <>
struct uniform_distribution<float> {
__device__ inline float4 operator()(curandStatePhilox4_32_10_t *state) const {
return curand_uniform4(state);
}
static constexpr int kReturnsCount = 4;
};
template <>
struct uniform_distribution<double> {
__device__ inline double2 operator()(
curandStatePhilox4_32_10_t *state) const {
return curand_uniform2_double(state);
}
static constexpr int kReturnsCount = 2;
};
template <>
struct normal_distribution<float> {
__device__ inline float4 operator()(curandStatePhilox4_32_10_t *state) const {
return curand_normal4(state);
}
static constexpr int kReturnsCount = 4;
};
template <>
struct normal_distribution<double> {
__device__ inline double2 operator()(
curandStatePhilox4_32_10_t *state) const {
return curand_normal2_double(state);
}
static constexpr int kReturnsCount = 2;
};
#else
template <>
struct uniform_distribution<float> {
__device__ inline float4 operator()(
hiprandStatePhilox4_32_10_t *state) const {
return hiprand_uniform4(state);
}
static constexpr int kReturnsCount = 4;
};
template <>
struct uniform_distribution<double> {
__device__ inline double2 operator()(
hiprandStatePhilox4_32_10_t *state) const {
return hiprand_uniform2_double(state);
}
static constexpr int kReturnsCount = 2;
};
template <>
struct normal_distribution<float> {
__device__ inline float4 operator()(
hiprandStatePhilox4_32_10_t *state) const {
return hiprand_normal4(state);
}
static constexpr int kReturnsCount = 4;
};
template <>
struct normal_distribution<double> {
__device__ inline double2 operator()(
hiprandStatePhilox4_32_10_t *state) const {
return hiprand_normal2_double(state);
}
static constexpr int kReturnsCount = 2;
};
#endif
/******** Launch GPU function of distribution and transformation *********/
template <typename T, typename DistOp, typename TransformOp>
__global__ void DistributionKernel(size_t size,
uint64_t seed,
uint64_t offset,
DistOp dist,
TransformOp trans,
T *out_data,
size_t stride) {
size_t idx = static_cast<size_t>(BLOCK_ID_X * BLOCK_NUM_X);
static constexpr int kCount = DistOp::kReturnsCount;
#if defined(__NVCC__)
curandStatePhilox4_32_10_t state;
curand_init(seed, idx + THREAD_ID_X, offset, &state);
using SType = curandStatePhilox4_32_10_t;
#else
hiprandStatePhilox4_32_10_t state;
hiprand_init(seed, idx + THREAD_ID_X, offset, &state);
using SType = hiprandStatePhilox4_32_10_t;
#endif
size_t total_thread = GRID_NUM_X * BLOCK_NUM_X;
T args[kCount];
T result[kCount];
for (size_t i = idx; i < size; i += total_thread * kCount) {
kps::ElementwiseRandom<SType, T, kCount, 1, DistOp>(&args[0], dist, &state);
kps::ElementwiseUnary<T, T, kCount, 1, 1, TransformOp>(
&result[0], &args[0], trans);
kps::WriteData<T, T, kCount, 1, 1, true>(
out_data + i, &result[0], size - i, 1, stride, 1);
__syncthreads();
}
}
template <typename T, typename DistOp, typename TransformOp>
void distribution_and_transform(const GPUContext &dev_ctx,
DenseTensor *out,
DistOp dist,
TransformOp trans) {
T *out_data = dev_ctx.template Alloc<T>(out);
auto size = out->numel();
int64_t device_id = dev_ctx.GetPlace().GetDeviceId();
auto gen_cuda = dev_ctx.GetGenerator();
size_t block_size = 256;
size_t expect_grid_size = (size + block_size - 1) / block_size;
const auto &prop = backends::gpu::GetDeviceProperties(device_id);
size_t max_grid_size = (prop.maxThreadsPerMultiProcessor / block_size) *
prop.multiProcessorCount;
size_t grid_size =
expect_grid_size > max_grid_size ? max_grid_size : expect_grid_size;
size_t total_thread = block_size * grid_size;
size_t curand4_loop_times =
(size + 4 * total_thread - 1) / (4 * total_thread);
// 'increment' should be a multiple of 4.
uint64_t increment = curand4_loop_times * 4;
auto seed_offset = gen_cuda->IncrementOffset(increment);
uint64_t seed = seed_offset.first;
uint64_t offset = seed_offset.second;
DistributionKernel<
T,
DistOp,
TransformOp><<<grid_size, block_size, 0, dev_ctx.stream()>>>(
size, seed, offset, dist, trans, out_data, total_thread);
}
#endif
} // namespace distribution
} // namespace phi
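Reviewer note: a host-side sketch (illustrative only, standard C++) of the launch bookkeeping in distribution_and_transform above. Each thread consumes four random values per Philox call, so the generator offset must advance by a multiple of 4; PlanDistributionLaunch is a hypothetical helper name.

#include <algorithm>
#include <cstddef>
#include <cstdint>

struct LaunchPlan { std::size_t grid, block; std::uint64_t increment; };

LaunchPlan PlanDistributionLaunch(std::size_t size, std::size_t max_grid) {
  const std::size_t block = 256;
  std::size_t grid = std::min((size + block - 1) / block, max_grid);
  std::size_t total_threads = grid * block;
  // Each curand/hiprand Philox call yields 4 values, hence the factor of 4.
  std::uint64_t loops = (size + 4 * total_threads - 1) / (4 * total_threads);
  return {grid, block, loops * 4};
}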
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/random.h>
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/hostdevice.h"
#include "paddle/phi/kernels/funcs/aligned_vector.h"
#include "paddle/phi/kernels/primitive/kernel_primitives.h"
namespace phi {
template <typename T, typename Functor, int VecSize>
__global__ void VectorizedIndexKernel(T *out,
size_t numel,
size_t main_offset,
Functor func) {
size_t data_offset = BLOCK_ID_X * BLOCK_NUM_X * VecSize;
size_t stride = BLOCK_NUM_X * GRID_NUM_X * VecSize;
size_t args[VecSize];
T result[VecSize];
for (; data_offset < main_offset; data_offset += stride) {
kps::InitWithDataIndex<size_t, VecSize, 1, 1>(&args[0], data_offset);
kps::ElementwiseUnary<size_t, T, VecSize, 1, 1, Functor>(
&result[0], &args[0], func);
kps::WriteData<T, VecSize, 1, 1, false>(
out + data_offset, &result[0], BLOCK_NUM_X * VecSize);
}
size_t num = numel - data_offset;
if (num > 0) {
kps::InitWithDataIndex<size_t, VecSize, 1, 1>(&args[0], data_offset);
kps::ElementwiseUnary<size_t, T, VecSize, 1, 1, Functor>(
&result[0], &args[0], func);
kps::WriteData<T, VecSize, 1, 1, true>(out + data_offset, &result[0], num);
}
}
template <typename T, typename Functor>
void IndexKernel(const KPDevice &dev_ctx, DenseTensor *out, Functor func) {
int numel = out->numel();
T *out_data = dev_ctx.template Alloc<T>(out);
if (numel <= 0) return;
int vec_size = phi::GetVectorizedSize(out_data);
#ifdef PADDLE_WITH_XPU_KP
int block = 64;
int grid = 8;
auto stream = dev_ctx.x_context()->xpu_stream;
#else
auto config =
phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel, vec_size);
int grid = config.block_per_grid.x;
int block = config.thread_per_block.x;
auto stream = dev_ctx.stream();
#endif
size_t main_offset = (numel / (vec_size * block)) * vec_size * block;
switch (vec_size) {
case 4:
VectorizedIndexKernel<T, Functor, 4><<<grid, block, 0, stream>>>(
out_data, numel, main_offset, func);
break;
case 2:
VectorizedIndexKernel<T, Functor, 2><<<grid, block, 0, stream>>>(
out_data, numel, main_offset, func);
break;
case 1:
VectorizedIndexKernel<T, Functor, 1><<<grid, block, 0, stream>>>(
out_data, numel, main_offset, func);
break;
default: {
PADDLE_THROW(phi::errors::Unimplemented(
"Unsupported vectorized size: %d !", vec_size));
break;
}
}
}
} // namespace phi
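Reviewer note: the split between the fully vectorized main loop and the boundary-checked tail in VectorizedIndexKernel is governed by main_offset. A standalone sketch of that computation (illustrative only):

#include <cstddef>

// Elements [0, main_offset) are written in full vec_size * block_threads chunks;
// the remaining numel - main_offset elements take the boundary-checked path.
std::size_t MainOffset(std::size_t numel, int vec_size, int block_threads) {
  std::size_t chunk = static_cast<std::size_t>(vec_size) * block_threads;
  return (numel / chunk) * chunk;
}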
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
namespace phi {
namespace funcs {
#define LOGICAL_BINARY_FUNCTOR(func_name, op) \
template <typename T> \
struct func_name { \
using ELEMENT_TYPE = T; \
HOSTDEVICE bool operator()(const T a, const T b) const { \
return static_cast<bool>(a) op static_cast<bool>(b); \
} \
};
LOGICAL_BINARY_FUNCTOR(LogicalOrFunctor, ||)
LOGICAL_BINARY_FUNCTOR(LogicalAndFunctor, &&)
LOGICAL_BINARY_FUNCTOR(LogicalXorFunctor, ^)
#undef LOGICAL_BINARY_FUNCTOR
template <typename T>
struct LogicalNotFunctor {
using ELEMENT_TYPE = T;
HOSTDEVICE bool operator()(const T a) const { return !a; }
};
} // namespace funcs
} // namespace phi
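Reviewer note: every binary functor above casts both operands to bool before applying the operator, so LogicalXorFunctor, for example, treats any two non-zero inputs as equal. A tiny host-side check (illustrative only):

#include <cassert>

template <typename T>
bool LogicalXorRef(T a, T b) {            // mirrors LogicalXorFunctor
  return static_cast<bool>(a) ^ static_cast<bool>(b);
}

int main() {
  assert(LogicalXorRef(2, 3) == false);   // both truthy -> false
  assert(LogicalXorRef(0, 7) == true);    // exactly one truthy -> true
  return 0;
}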
...@@ -12,11 +12,10 @@ ...@@ -12,11 +12,10 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/phi/kernels/impl/atan2_grad_kernel_impl.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/device_context.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/atan2_grad_kernel.h"
#include "paddle/phi/kernels/impl/atan2_grad_kernel_impl.h"
PD_REGISTER_KERNEL(atan2_grad, PD_REGISTER_KERNEL(atan2_grad,
GPU, GPU,
......
...@@ -12,11 +12,10 @@ ...@@ -12,11 +12,10 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/phi/kernels/impl/atan2_kernel_impl.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/device_context.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/atan2_kernel.h"
#include "paddle/phi/kernels/impl/atan2_kernel_impl.h"
PD_REGISTER_KERNEL(atan2, PD_REGISTER_KERNEL(atan2,
GPU, GPU,
......
...@@ -80,8 +80,4 @@ void CastKernel(const Context& dev_ctx, ...@@ -80,8 +80,4 @@ void CastKernel(const Context& dev_ctx,
paddle::experimental::DataType::UNDEFINED); \ paddle::experimental::DataType::UNDEFINED); \
} }
#if !defined(PADDLE_WITH_HIP)
PTEN_REGISTER_CAST_CUDA_BASE_TYPE(cast, phi::dtype::bfloat16) PTEN_REGISTER_CAST_CUDA_BASE_TYPE(cast, phi::dtype::bfloat16)
#else
PTEN_REGISTER_CAST_CUDA_BASE_TYPE(cast)
#endif
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/index_sample_grad_kernel.h"
#include <algorithm>
#include <vector>
#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace phi {
namespace {
template <typename Context>
void LimitGridDim(const Context& ctx, dim3* grid_dim) {
auto max_grid_dim =
reinterpret_cast<const phi::GPUContext&>(ctx).GetCUDAMaxGridDimSize();
grid_dim->x = grid_dim->x < max_grid_dim[0] ? grid_dim->x : max_grid_dim[0];
grid_dim->y = grid_dim->y < max_grid_dim[1] ? grid_dim->y : max_grid_dim[1];
}
#define PREDEFINED_BLOCK_SIZE_X 512
#define PREDEFINED_BLOCK_SIZE 1024
#define MIN(a, b) ((a) < (b) ? (a) : (b))
}  // namespace
template <typename T, typename IndexT = int>
__global__ void IndexSampleGrad(const IndexT* index,
T* in_grad,
const T* out_grad,
size_t index_length,
size_t input_length,
size_t batch_size,
bool same_data_in_row = true) {
unsigned int index_i = blockDim.x * blockIdx.x + threadIdx.x;
unsigned int index_j = blockDim.y * blockIdx.y + threadIdx.y;
for (; index_j < batch_size; index_j += blockDim.y * gridDim.y) {
index_i = blockDim.x * blockIdx.x + threadIdx.x;
for (; index_i < index_length; index_i += blockDim.x * gridDim.x) {
unsigned int index_idx = index_j * index_length + index_i;
unsigned int in_idx = index_j * input_length + index_i;
IndexT sample_idx = index[index_idx];
if (same_data_in_row) {
paddle::platform::CudaAtomicAdd(
&(in_grad[in_idx - index_i + sample_idx]), out_grad[sample_idx]);
} else {
in_grad[in_idx - index_i + sample_idx] = out_grad[index_idx];
}
}
}
}
template <typename T, typename Context>
void IndexSampleGradKernel(const Context& ctx,
const DenseTensor& out_grad,
const DenseTensor& x,
const DenseTensor& index,
DenseTensor* x_grad) {
const T* output_grad_data = out_grad.data<T>();
T* input_grad_data = ctx.template Alloc<T>(x_grad);
auto index_type = index.dtype();
bool index_type_match =
index_type == DataType::INT32 || index_type == DataType::INT64;
PADDLE_ENFORCE_EQ(
index_type_match,
true,
errors::InvalidArgument(
"Input(Index) holds the wrong type, it holds %s, but "
"desires to be %s or %s",
paddle::framework::DataTypeToString(
paddle::framework::TransToProtoVarType(index_type)),
paddle::framework::DataTypeToString(
paddle::framework::TransToProtoVarType(DataType::INT32)),
paddle::framework::DataTypeToString(
paddle::framework::TransToProtoVarType((DataType::INT64)))));
auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
auto input_num = x.numel();
auto input_dim = x.dims();
auto index_dim = index.dims();
size_t batch_size = index_dim[0];
size_t input_length = input_dim[1];
size_t index_length = index_dim[1];
bool same_data_in_index_row = index_length == 1 ? false : true;
auto block_width = paddle::platform::RoundToPowerOfTwo(index_length);
block_width = MIN(block_width, PREDEFINED_BLOCK_SIZE_X);
auto block_height =
paddle::platform::RoundToPowerOfTwo(index_length * batch_size) /
block_width;
block_height = MIN(block_height, PREDEFINED_BLOCK_SIZE / block_width);
dim3 block_dim(block_width, block_height);
dim3 grid_dim((index_length + block_dim.x - 1) / block_dim.x,
(batch_size + block_dim.y - 1) / block_dim.y);
LimitGridDim(ctx, &grid_dim);
phi::funcs::SetConstant<Context, T> set_zero;
set_zero(ctx, x_grad, static_cast<T>(0));
if (index_type == DataType::INT64) {
const int64_t* index_data = index.data<int64_t>();
IndexSampleGrad<T, int64_t><<<grid_dim, block_dim, 0, stream>>>(
index_data,
input_grad_data,
output_grad_data,
index_length,
input_length,
batch_size,
same_data_in_index_row);
} else if (index_type == DataType::INT32) {
const int* index_data = index.data<int>();
IndexSampleGrad<T, int><<<grid_dim, block_dim, 0, stream>>>(
index_data,
input_grad_data,
output_grad_data,
index_length,
input_length,
batch_size,
same_data_in_index_row);
}
}
} // namespace phi
PD_REGISTER_KERNEL(index_sample_grad,
GPU,
ALL_LAYOUT,
phi::IndexSampleGradKernel,
float,
double,
int,
int64_t) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/index_sample_kernel.h"
#include <algorithm>
#include <vector>
#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace phi {
namespace {
template <typename Context>
void LimitGridDim(const Context& ctx, dim3* grid_dim) {
auto max_grid_dim =
reinterpret_cast<const phi::GPUContext&>(ctx).GetCUDAMaxGridDimSize();
grid_dim->x = grid_dim->x < max_grid_dim[0] ? grid_dim->x : max_grid_dim[0];
grid_dim->y = grid_dim->y < max_grid_dim[1] ? grid_dim->y : max_grid_dim[1];
}
#define PREDEFINED_BLOCK_SIZE_X 512
#define PREDEFINED_BLOCK_SIZE 1024
#define MIN(a, b) ((a) < (b) ? (a) : (b))
}
template <typename T, typename IndexT = int>
__global__ void IndexSampleForward(const IndexT* index,
const T* in_data,
T* out_data,
size_t index_length,
size_t input_length,
size_t batch_size) {
unsigned int index_i = blockDim.x * blockIdx.x + threadIdx.x;
unsigned int index_j = blockDim.y * blockIdx.y + threadIdx.y;
for (; index_j < batch_size; index_j += blockDim.y * gridDim.y) {
index_i = blockDim.x * blockIdx.x + threadIdx.x;
for (; index_i < index_length; index_i += blockDim.x * gridDim.x) {
unsigned int index_idx = index_j * index_length + index_i;
unsigned int in_idx = index_j * input_length + index_i;
IndexT sample_idx = index[index_idx];
out_data[index_idx] = in_data[in_idx - index_i + sample_idx];
}
}
}
template <typename T, typename Context>
void IndexSampleKernel(const Context& ctx,
const DenseTensor& x,
const DenseTensor& index,
DenseTensor* out) {
auto index_type = index.dtype();
bool index_type_match =
index_type == DataType::INT32 || index_type == DataType::INT64;
PADDLE_ENFORCE_EQ(
index_type_match,
true,
errors::InvalidArgument(
"Input(Index) holds the wrong type, it holds %s, but "
"desires to be %s or %s",
paddle::framework::DataTypeToString(
paddle::framework::TransToProtoVarType(index_type)),
paddle::framework::DataTypeToString(
paddle::framework::TransToProtoVarType(DataType::INT32)),
paddle::framework::DataTypeToString(
paddle::framework::TransToProtoVarType((DataType::INT64)))));
const T* in_data = x.data<T>();
T* out_data = ctx.template Alloc<T>(out);
auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
auto input_dim = x.dims();
auto index_dim = index.dims();
size_t batch_size = input_dim[0];
size_t input_length = input_dim[1];
size_t index_length = index_dim[1];
auto block_width = paddle::platform::RoundToPowerOfTwo(index_length);
block_width = MIN(block_width, PREDEFINED_BLOCK_SIZE_X);
int block_height =
paddle::platform::RoundToPowerOfTwo(index_length * batch_size) /
block_width;
block_height = MIN(block_height, PREDEFINED_BLOCK_SIZE / block_width);
dim3 block_dim(block_width, block_height);
dim3 grid_dim((index_length + block_dim.x - 1) / block_dim.x,
(batch_size + block_dim.y - 1) / block_dim.y);
LimitGridDim(ctx, &grid_dim);
if (index_type == DataType::INT64) {
const int64_t* index_data = index.data<int64_t>();
IndexSampleForward<T, int64_t><<<grid_dim, block_dim, 0, stream>>>(
index_data, in_data, out_data, index_length, input_length, batch_size);
} else if (index_type == DataType::INT32) {
const int* index_data = index.data<int>();
IndexSampleForward<T, int><<<grid_dim, block_dim, 0, stream>>>(
index_data, in_data, out_data, index_length, input_length, batch_size);
}
}
} // namespace phi
PD_REGISTER_KERNEL(index_sample,
GPU,
ALL_LAYOUT,
phi::IndexSampleKernel,
float,
double,
int,
int64_t) {}
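Reviewer note: a host-side sketch (plain C++, illustrative only) of the block/grid sizing used by the GPU index_sample kernels above — block width tracks the index row length rounded up to a power of two and capped at 512, block height fills out a 1024-thread block, and the grid covers the full index matrix before being clamped to the device limits. RoundUpPow2 is a hypothetical stand-in for platform::RoundToPowerOfTwo.

#include <algorithm>
#include <cstddef>

std::size_t RoundUpPow2(std::size_t n) {
  std::size_t p = 1;
  while (p < n) p <<= 1;
  return p;
}

void PlanIndexSampleLaunch(std::size_t batch, std::size_t index_length,
                           std::size_t* block_x, std::size_t* block_y,
                           std::size_t* grid_x, std::size_t* grid_y) {
  *block_x = std::min<std::size_t>(RoundUpPow2(index_length), 512);  // PREDEFINED_BLOCK_SIZE_X
  *block_y = std::min<std::size_t>(RoundUpPow2(index_length * batch) / *block_x,
                                   1024 / *block_x);                 // PREDEFINED_BLOCK_SIZE
  *grid_x = (index_length + *block_x - 1) / *block_x;
  *grid_y = (batch + *block_y - 1) / *block_y;
}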
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/logical_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/elementwise_base.h"
#include "paddle/phi/kernels/funcs/logical_functor.h"
#include "paddle/phi/kernels/gpu/elementwise.h"
namespace phi {
#define DEFINE_LOGICAL_BINARY_KERNEL(type) \
template <typename T, typename Context> \
void Logical##type##Kernel(const Context& dev_ctx, \
const DenseTensor& x, \
const DenseTensor& y, \
DenseTensor* out) { \
using InT = typename funcs::Logical##type##Functor<T>::ELEMENT_TYPE; \
using OutT = bool; \
dev_ctx.template Alloc<bool>(out); \
funcs::Logical##type##Functor<T> binary_func; \
std::vector<const DenseTensor*> ins = {&x, &y}; \
std::vector<DenseTensor*> outs = {out}; \
funcs::BroadcastKernel<ElementwiseType::kBinary, InT, OutT>( \
dev_ctx, ins, &outs, -1, binary_func); \
}
DEFINE_LOGICAL_BINARY_KERNEL(And)
DEFINE_LOGICAL_BINARY_KERNEL(Or)
DEFINE_LOGICAL_BINARY_KERNEL(Xor)
#undef DEFINE_LOGICAL_BINARY_KERNEL
template <typename T, typename Context>
void LogicalNotKernel(const Context& dev_ctx,
const DenseTensor& x,
DenseTensor* out) {
using InT = typename funcs::LogicalNotFunctor<T>::ELEMENT_TYPE;
using OutT = bool;
dev_ctx.template Alloc<bool>(out);
funcs::LogicalNotFunctor<T> unary_func;
std::vector<const DenseTensor*> ins = {&x};
std::vector<DenseTensor*> outs = {out};
funcs::BroadcastKernel<ElementwiseType::kUnary, InT, OutT>(
dev_ctx, ins, &outs, -1, unary_func);
}
} // namespace phi
#define REGISTER_LOGICAL_CUDA_KERNEL(logical_and, func_type) \
PD_REGISTER_KERNEL(logical_and, \
GPU, \
ALL_LAYOUT, \
phi::Logical##func_type##Kernel, \
float, \
double, \
bool, \
int64_t, \
int, \
int8_t, \
int16_t) {}
REGISTER_LOGICAL_CUDA_KERNEL(logical_and, And)
REGISTER_LOGICAL_CUDA_KERNEL(logical_or, Or)
REGISTER_LOGICAL_CUDA_KERNEL(logical_not, Not)
REGISTER_LOGICAL_CUDA_KERNEL(logical_xor, Xor)
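For readability, this is roughly what DEFINE_LOGICAL_BINARY_KERNEL(And) above expands to after preprocessing. It is shown only as an illustration of the macro and relies on the same headers and namespace as the file above; it is not an additional definition.
// Illustrative expansion of DEFINE_LOGICAL_BINARY_KERNEL(And):
template <typename T, typename Context>
void LogicalAndKernel(const Context& dev_ctx,
                      const DenseTensor& x,
                      const DenseTensor& y,
                      DenseTensor* out) {
  using InT = typename funcs::LogicalAndFunctor<T>::ELEMENT_TYPE;
  using OutT = bool;
  dev_ctx.template Alloc<bool>(out);
  funcs::LogicalAndFunctor<T> binary_func;
  std::vector<const DenseTensor*> ins = {&x, &y};
  std::vector<DenseTensor*> outs = {out};
  funcs::BroadcastKernel<ElementwiseType::kBinary, InT, OutT>(
      dev_ctx, ins, &outs, -1, binary_func);
}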
...@@ -155,6 +155,7 @@ PD_REGISTER_KERNEL(sum_raw, ...@@ -155,6 +155,7 @@ PD_REGISTER_KERNEL(sum_raw,
float, float,
double, double,
float16, float16,
bfloat16,
int16_t, int16_t,
int, int,
int64_t, int64_t,
......
...@@ -70,6 +70,7 @@ PD_REGISTER_KERNEL(scale, ...@@ -70,6 +70,7 @@ PD_REGISTER_KERNEL(scale,
float, float,
double, double,
phi::dtype::float16, phi::dtype::float16,
phi::dtype::bfloat16,
uint8_t, uint8_t,
int8_t, int8_t,
int16_t, int16_t,
......
...@@ -27,20 +27,6 @@ void SplitKernel(const Context& dev_ctx, ...@@ -27,20 +27,6 @@ void SplitKernel(const Context& dev_ctx,
const ScalarArray& num_or_sections, const ScalarArray& num_or_sections,
const Scalar& axis_scalar, const Scalar& axis_scalar,
std::vector<DenseTensor*> outs) { std::vector<DenseTensor*> outs) {
// need to infershape output
if (num_or_sections.FromTensor() || axis_scalar.FromTensor()) {
std::vector<MetaTensor> out_metas;
for (size_t i = 0; i < outs.size(); ++i) {
out_metas.push_back(outs[i]);
}
phi::SplitInferMeta(x, num_or_sections, axis_scalar, &out_metas, true);
for (size_t i = 0; i < out_metas.size(); ++i) {
outs[i]->Resize(out_metas[i].dims());
}
}
std::vector<const DenseTensor*> shape_refer; std::vector<const DenseTensor*> shape_refer;
for (size_t j = 0; j < outs.size(); ++j) { for (size_t j = 0; j < outs.size(); ++j) {
dev_ctx.template Alloc<T>(outs[j]); dev_ctx.template Alloc<T>(outs[j]);
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/uniform_random_kernel.h"
#include "gflags/gflags.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/distribution_helper.h"
#include "paddle/phi/kernels/funcs/index_impl.cu.h"
DECLARE_bool(use_curand);
namespace phi {
template <typename T>
struct UniformGenerator {
T min_, max_;
unsigned int seed_;
T diag_val_;
unsigned int diag_num_;
unsigned int diag_step_;
__host__ __device__ UniformGenerator(
T min, T max, int seed, int diag_num, int diag_step, T diag_val)
: min_(min),
max_(max),
seed_(seed),
diag_num_(diag_num),
diag_step_(diag_step),
diag_val_(diag_val) {}
__host__ __device__ T operator()(const unsigned int n) const {
thrust::minstd_rand rng;
rng.seed(seed_);
thrust::uniform_real_distribution<T> dist(min_, max_);
rng.discard(n);
T out = dist(rng);
unsigned int remainder = n % (diag_step_ + 1);
if (remainder == 0 && diag_num_ > n / (diag_step_ + 1)) {
out = diag_val_;
}
return out;
}
};
template <typename T>
struct UniformGeneratorOffset {
T min_, max_;
unsigned int seed_;
T diag_val_;
unsigned int diag_num_;
unsigned int diag_step_;
int offset_;
__host__ __device__ UniformGeneratorOffset(T min,
T max,
int seed,
int diag_num,
int diag_step,
T diag_val,
int offset)
: min_(min),
max_(max),
seed_(seed),
diag_num_(diag_num),
diag_step_(diag_step),
diag_val_(diag_val),
offset_(offset) {}
__host__ __device__ T operator()(const unsigned int n) const {
thrust::minstd_rand rng;
rng.seed(seed_);
thrust::uniform_real_distribution<T> dist(min_, max_);
rng.discard(n + offset_);
T out = dist(rng);
unsigned int remainder = n % (diag_step_ + 1);
if (remainder == 0 && diag_num_ > n / (diag_step_ + 1)) {
out = diag_val_;
}
return out;
}
};
template <typename T, typename Context>
void UniformRandomRawKernel(const Context& dev_ctx,
const ScalarArray& shape,
DataType dtype,
float min,
float max,
int seed,
int diag_num,
int diag_step,
float diag_val,
DenseTensor* out) {
out->Resize(phi::make_ddim(shape.GetData()));
T* data = dev_ctx.template Alloc<T>(out);
auto size = out->numel();
bool seed_flag = false;
if (seed == 0) {
std::random_device rd;
seed = rd();
seed_flag = true;
}
auto generator = dev_ctx.GetGenerator();
if (generator->GetIsInitPy() && seed_flag) {
if (FLAGS_use_curand) {
using MT = typename kps::details::MPTypeTrait<T>::Type;
distribution::uniform_distribution<MT> dist;
distribution::uniform_transform<MT> trans(min, max);
distribution::distribution_and_transform<T>(dev_ctx, out, dist, trans);
} else {
auto seed_offset = generator->IncrementOffset(1);
int64_t gen_offset = size * seed_offset.second;
auto func = UniformGeneratorOffset<T>(min,
max,
seed_offset.first,
diag_num,
diag_step,
diag_val,
gen_offset);
IndexKernel<T, UniformGeneratorOffset<T>>(dev_ctx, out, func);
}
} else {
auto func =
UniformGenerator<T>(min, max, seed, diag_num, diag_step, diag_val);
IndexKernel<T, UniformGenerator<T>>(dev_ctx, out, func);
}
}
template <typename T, typename Context>
void UniformRandomKernel(const Context& dev_ctx,
const ScalarArray& shape,
DataType dtype,
float min,
float max,
int seed,
DenseTensor* out) {
UniformRandomRawKernel<T>(
dev_ctx, shape, dtype, min, max, seed, 0, 0, 0.0f, out);
}
} // namespace phi
PD_REGISTER_KERNEL(uniform_random_raw,
GPU,
ALL_LAYOUT,
phi::UniformRandomRawKernel,
float,
double) {}
PD_REGISTER_KERNEL(
uniform_random, GPU, ALL_LAYOUT, phi::UniformRandomKernel, float, double) {}
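The diagonal-fill branch shared by UniformGenerator and UniformGeneratorOffset above overwrites every (diag_step + 1)-th flat index with diag_val, up to diag_num elements. A host-only, purely illustrative sketch of that rule:
// Host-only sketch of the diagonal-fill rule used by the generators above:
// element n gets diag_val when n is a multiple of (diag_step + 1) and fewer
// than diag_num diagonal elements precede it.
#include <cstdio>

bool IsDiagElement(unsigned int n, unsigned int diag_num,
                   unsigned int diag_step) {
  unsigned int remainder = n % (diag_step + 1);
  return remainder == 0 && diag_num > n / (diag_step + 1);
}

int main() {
  // For a 4x4 matrix stored row-major, diag_step = 4 and diag_num = 4 place
  // diag_val on the main diagonal (flat indices 0, 5, 10, 15).
  for (unsigned int n = 0; n < 16; ++n) {
    if (IsDiagElement(n, /*diag_num=*/4, /*diag_step=*/4)) {
      std::printf("index %u gets diag_val\n", n);
    }
  }
  return 0;
}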
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/where_grad_kernel.h"
namespace phi {
template <typename T>
__global__ void WhereGradCUDAKernel(
const int N, const T* dout, const bool* cond, T* dx, T* dy) {
int idx = blockDim.x * blockIdx.x + threadIdx.x;
for (; idx < N; idx += blockDim.x * gridDim.x) {
if (dx != nullptr) {
dx[idx] = cond[idx] ? dout[idx] : 0.;
}
if (dy != nullptr) {
dy[idx] = cond[idx] ? 0. : dout[idx];
}
}
}
template <typename T, typename Context>
void WhereGradKernel(const Context& ctx,
const DenseTensor& condition,
const DenseTensor& x,
const DenseTensor& y,
const DenseTensor& out_grad,
DenseTensor* x_grad,
DenseTensor* y_grad) {
const bool* cond_data = condition.data<bool>();
auto numel = condition.numel();
auto* dout = out_grad.data<T>();
T* dx = (x_grad != nullptr) ? ctx.template Alloc<T>(x_grad) : nullptr;
T* dy = (y_grad != nullptr) ? ctx.template Alloc<T>(y_grad) : nullptr;
auto stream = ctx.stream();
auto config = backends::gpu::GetGpuLaunchConfig1D(ctx, numel);
WhereGradCUDAKernel<
T><<<config.block_per_grid.x, config.thread_per_block.x, 0, stream>>>(
numel, dout, cond_data, dx, dy);
}
} // namespace phi
PD_REGISTER_KERNEL(where_grad,
GPU,
ALL_LAYOUT,
phi::WhereGradKernel,
float,
double,
int,
int64_t) {}
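The gradient rule computed by WhereGradCUDAKernel above is dx = cond ? dout : 0 and dy = cond ? 0 : dout, with a null dx or dy simply skipped. A minimal CPU reference sketch, not part of the registered kernels:
// Minimal CPU reference for the where_grad rule above (illustrative only).
#include <cstddef>

template <typename T>
void WhereGradRef(std::size_t n, const T* dout, const bool* cond,
                  T* dx /* may be null */, T* dy /* may be null */) {
  for (std::size_t i = 0; i < n; ++i) {
    if (dx != nullptr) dx[i] = cond[i] ? dout[i] : static_cast<T>(0);
    if (dy != nullptr) dy[i] = cond[i] ? static_cast<T>(0) : dout[i];
  }
}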
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/where_kernel.h"
#include "paddle/phi/kernels/funcs/broadcast_function.h"
#include "paddle/phi/kernels/funcs/elementwise_functor.h"
namespace phi {
// Cond
template <typename T>
struct CondFunctor {
inline HOSTDEVICE T operator()(const bool cond, const T x, const T y) const {
return cond ? x : y;
}
};
template <typename T, typename Context>
void WhereKernel(const Context& ctx,
const DenseTensor& condition,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* out) {
std::vector<const DenseTensor*> ins = {&condition, &x, &y};
std::vector<DenseTensor*> outs = {out};
ctx.template Alloc<T>(out);
CondFunctor<T> func;
funcs::BroadcastKernel<ElementwiseType::kTernary, T, T>(
ctx, ins, &outs, -1, func);
}
} // namespace phi
PD_REGISTER_KERNEL(
where, GPU, ALL_LAYOUT, phi::WhereKernel, float, double, int, int64_t) {}
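Ignoring the broadcasting handled by funcs::BroadcastKernel, the forward rule applied by CondFunctor above reduces to an element-wise select. A CPU sketch for same-shaped inputs:
// Element-wise rule applied by CondFunctor above, for same-shaped inputs
// (broadcasting, which BroadcastKernel handles, is omitted).
#include <cstddef>

template <typename T>
void WhereRef(std::size_t n, const bool* cond, const T* x, const T* y,
              T* out) {
  for (std::size_t i = 0; i < n; ++i) {
    out[i] = cond[i] ? x[i] : y[i];
  }
}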
...@@ -14,9 +14,10 @@ ...@@ -14,9 +14,10 @@
#pragma once #pragma once
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/atan2_grad_kernel.h" #include "paddle/phi/kernels/atan2_grad_kernel.h"
#include "paddle/phi/kernels/funcs/for_range.h"
#include "paddle/fluid/platform/for_range.h"
#include "paddle/phi/core/dense_tensor.h"
namespace phi { namespace phi {
......
...@@ -14,9 +14,10 @@ ...@@ -14,9 +14,10 @@
#pragma once #pragma once
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/atan2_kernel.h" #include "paddle/phi/kernels/atan2_kernel.h"
#include "paddle/phi/kernels/funcs/for_range.h"
#include "paddle/fluid/platform/for_range.h"
#include "paddle/phi/core/dense_tensor.h"
namespace phi { namespace phi {
template <typename T> template <typename T>
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void IndexSampleGradKernel(const Context& ctx,
const DenseTensor& out_grad,
const DenseTensor& x,
const DenseTensor& index,
DenseTensor* in_grad);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void IndexSampleKernel(const Context& ctx,
const DenseTensor& x,
const DenseTensor& index,
DenseTensor* out);
} // namespace phi
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
#define DECLARE_LOGICAL_BINARY_KERNEL(type)               \
  template <typename T, typename Context>                 \
  void Logical##type##Kernel(const Context& dev_ctx,      \
                             const DenseTensor& x,        \
                             const DenseTensor& y,        \
                             DenseTensor* out);
DECLARE_LOGICAL_BINARY_KERNEL(And)
DECLARE_LOGICAL_BINARY_KERNEL(Or)
DECLARE_LOGICAL_BINARY_KERNEL(Xor)
#undef DECLARE_LOGICAL_BINARY_KERNEL
template <typename T, typename Context>
void LogicalNotKernel(const Context& dev_ctx,
const DenseTensor& x,
DenseTensor* out);
} // namespace phi
...@@ -165,6 +165,7 @@ PD_REGISTER_KERNEL(sum, ...@@ -165,6 +165,7 @@ PD_REGISTER_KERNEL(sum,
float, float,
double, double,
phi::dtype::float16, phi::dtype::float16,
phi::dtype::bfloat16,
int16_t, int16_t,
int, int,
int64_t, int64_t,
......
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/kernels/uniform_random_kernel.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
template <typename T, typename Context>
void UniformRandomRawSRKernel(const Context& dev_ctx,
const ScalarArray& shape,
DataType dtype,
float min,
float max,
int seed,
int diag_num,
int diag_step,
float diag_val,
SelectedRows* out) {
phi::UniformRandomRawKernel<T>(dev_ctx,
shape,
dtype,
min,
max,
seed,
diag_num,
diag_step,
diag_val,
out->mutable_value());
}
template <typename T, typename Context>
void UniformRandomSRKernel(const Context& dev_ctx,
const ScalarArray& shape,
DataType dtype,
float min,
float max,
int seed,
SelectedRows* out) {
phi::UniformRandomKernel<T>(
dev_ctx, shape, dtype, min, max, seed, out->mutable_value());
}
} // namespace phi
PD_REGISTER_KERNEL(uniform_random_raw_sr,
CPU,
ALL_LAYOUT,
phi::UniformRandomRawSRKernel,
float,
double,
phi::dtype::bfloat16) {}
PD_REGISTER_KERNEL(uniform_random_sr,
CPU,
ALL_LAYOUT,
phi::UniformRandomSRKernel,
float,
double,
phi::dtype::bfloat16) {}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PD_REGISTER_KERNEL(uniform_random_raw_sr,
GPU,
ALL_LAYOUT,
phi::UniformRandomRawSRKernel,
float,
double) {}
PD_REGISTER_KERNEL(uniform_random_sr,
GPU,
ALL_LAYOUT,
phi::UniformRandomSRKernel,
float,
double) {}
#endif
...@@ -43,18 +43,18 @@ std::vector<DenseTensor> Split(const Context& dev_ctx, ...@@ -43,18 +43,18 @@ std::vector<DenseTensor> Split(const Context& dev_ctx,
} }
std::vector<MetaTensor> out_meta; std::vector<MetaTensor> out_meta;
std::vector<MetaTensor*> out_meta_ptr;
out_meta.reserve(out_number); out_meta.reserve(out_number);
out_meta_ptr.reserve(out_number);
std::vector<DenseTensor> result; std::vector<DenseTensor> result;
result.reserve(out_number); result.reserve(out_number);
for (size_t i = 0; i < out_number; ++i) { for (size_t i = 0; i < out_number; ++i) {
auto dense_out = phi::Empty<T, Context>(dev_ctx); result.emplace_back(phi::Empty<T, Context>(dev_ctx));
MetaTensor tmp_meta(&dense_out); out_meta.emplace_back(&result.back());
out_meta_ptr.push_back(&out_meta.back());
result.push_back(dense_out);
out_meta.push_back(&result.back());
} }
SplitInferMeta(x, num_or_sections, axis, &out_meta); SplitInferMeta(x, num_or_sections, axis, out_meta_ptr);
std::vector<DenseTensor*> outs; std::vector<DenseTensor*> outs;
outs.reserve(out_meta.size()); outs.reserve(out_meta.size());
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/common/scalar_array.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/device_context.h"
#include "paddle/phi/core/selected_rows.h"
namespace phi {
template <typename T, typename Context>
void UniformRandomRawKernel(const Context& dev_ctx,
const ScalarArray& shape,
DataType dtype,
float min,
float max,
int seed,
int diag_num,
int diag_step,
float diag_val,
DenseTensor* out);
template <typename T, typename Context>
void UniformRandomKernel(const Context& dev_ctx,
const ScalarArray& shape,
DataType dtype,
float min,
float max,
int seed,
DenseTensor* out);
template <typename T, typename Context>
void UniformRandomRawSRKernel(const Context& dev_ctx,
const ScalarArray& shape,
DataType dtype,
float min,
float max,
int seed,
int diag_num,
int diag_step,
float diag_val,
SelectedRows* out);
template <typename T, typename Context>
void UniformRandomSRKernel(const Context& dev_ctx,
const ScalarArray& shape,
DataType dtype,
float min,
float max,
int seed,
SelectedRows* out);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
template <typename T, typename Context>
void WhereGradKernel(const Context& ctx,
const DenseTensor& condition,
const DenseTensor& x,
const DenseTensor& y,
const DenseTensor& out_grad,
DenseTensor* x_grad,
DenseTensor* y_grad);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
template <typename T, typename Context>
void WhereKernel(const Context& ctx,
const DenseTensor& condition,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* out);
} // namespace phi
...@@ -59,7 +59,7 @@ void FullKernel(const Context& dev_ctx, ...@@ -59,7 +59,7 @@ void FullKernel(const Context& dev_ctx,
const Scalar& val, const Scalar& val,
DataType dtype, DataType dtype,
DenseTensor* out) { DenseTensor* out) {
out->ResizeAndAllocate(phi::make_ddim(shape.GetData())); out->Resize(phi::make_ddim(shape.GetData()));
FullValueXPU<T>(dev_ctx, out, val.to<T>()); FullValueXPU<T>(dev_ctx, out, val.to<T>());
} }
...@@ -69,6 +69,7 @@ void FullLikeKernel(const Context& dev_ctx, ...@@ -69,6 +69,7 @@ void FullLikeKernel(const Context& dev_ctx,
const Scalar& val, const Scalar& val,
DataType dtype, DataType dtype,
DenseTensor* out) { DenseTensor* out) {
dev_ctx.template Alloc<T>(out);
auto value = val.to<float>(); auto value = val.to<float>();
using XPUInTDType = typename XPUTypeTrait<T>::Type; using XPUInTDType = typename XPUTypeTrait<T>::Type;
using CommonType = typename std::common_type< using CommonType = typename std::common_type<
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
KernelSignature IndexSampleGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature("index_sample_grad",
{GradVarName("Out"), "X", "Index"},
{},
{GradVarName("X")});
}
} // namespace phi
PD_REGISTER_ARG_MAPPING_FN(index_sample_grad,
phi::IndexSampleGradOpArgumentMapping);
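GradVarName above yields the framework's gradient-variable name for a given forward variable; in Paddle this is conventionally the original name with an "@GRAD" suffix. A toy stand-in for illustration only, assuming that suffix convention:
// Toy stand-in for GradVarName, assuming the "@GRAD" suffix convention.
#include <string>

std::string GradVarNameSketch(const std::string& var) {
  return var + "@GRAD";  // e.g. "Out" -> "Out@GRAD", "X" -> "X@GRAD"
}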
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
KernelSignature UniformRandomOpArgumentMapping(
const ArgumentMappingContext& ctx) {
int diag_num = paddle::any_cast<int>(ctx.Attr("diag_num"));
if (ctx.IsDenseTensorOutput("Out")) {
if (diag_num) {
if (ctx.InputSize("ShapeTensorList") > 0) {
return KernelSignature("uniform_random_raw",
{},
{"ShapeTensorList",
"dtype",
"min",
"max",
"seed",
"diag_num",
"diag_step",
"diag_val"},
{"Out"});
} else {
const auto& shape =
paddle::any_cast<std::vector<int64_t>>(ctx.Attr("shape"));
if (ctx.HasInput("ShapeTensor") && shape.empty()) {
return KernelSignature("uniform_random_raw",
{},
{"ShapeTensor",
"dtype",
"min",
"max",
"seed",
"diag_num",
"diag_step",
"diag_val"},
{"Out"});
} else {
return KernelSignature("uniform_random_raw",
{},
{"shape",
"dtype",
"min",
"max",
"seed",
"diag_num",
"diag_step",
"diag_val"},
{"Out"});
}
}
} else {
if (ctx.InputSize("ShapeTensorList") > 0) {
return KernelSignature(
"uniform_random",
{},
{"ShapeTensorList", "dtype", "min", "max", "seed"},
{"Out"});
} else {
const auto& shape =
paddle::any_cast<std::vector<int64_t>>(ctx.Attr("shape"));
if (ctx.HasInput("ShapeTensor") && shape.empty()) {
return KernelSignature("uniform_random",
{},
{"ShapeTensor", "dtype", "min", "max", "seed"},
{"Out"});
} else {
return KernelSignature("uniform_random",
{},
{"shape", "dtype", "min", "max", "seed"},
{"Out"});
}
}
}
} else if (ctx.IsSelectedRowsOutput("Out")) {
if (diag_num) {
if (ctx.InputSize("ShapeTensorList") > 0) {
return KernelSignature("uniform_random_raw_sr",
{},
{"ShapeTensorList",
"dtype",
"min",
"max",
"seed",
"diag_num",
"diag_step",
"diag_val"},
{"Out"});
} else {
const auto& shape =
paddle::any_cast<std::vector<int64_t>>(ctx.Attr("shape"));
if (ctx.HasInput("ShapeTensor") && shape.empty()) {
return KernelSignature("uniform_random_raw_sr",
{},
{"ShapeTensor",
"dtype",
"min",
"max",
"seed",
"diag_num",
"diag_step",
"diag_val"},
{"Out"});
} else {
return KernelSignature("uniform_random_raw_sr",
{},
{"shape",
"dtype",
"min",
"max",
"seed",
"diag_num",
"diag_step",
"diag_val"},
{"Out"});
}
}
} else {
if (ctx.InputSize("ShapeTensorList") > 0) {
return KernelSignature(
"uniform_random_sr",
{},
{"ShapeTensorList", "dtype", "min", "max", "seed"},
{"Out"});
} else {
const auto& shape =
paddle::any_cast<std::vector<int64_t>>(ctx.Attr("shape"));
if (ctx.HasInput("ShapeTensor") && shape.empty()) {
return KernelSignature("uniform_random_sr",
{},
{"ShapeTensor", "dtype", "min", "max", "seed"},
{"Out"});
} else {
return KernelSignature("uniform_random_sr",
{},
{"shape", "dtype", "min", "max", "seed"},
{"Out"});
}
}
}
}
return KernelSignature("unregistered", {}, {}, {});
}
} // namespace phi
PD_REGISTER_ARG_MAPPING_FN(uniform_random, phi::UniformRandomOpArgumentMapping);
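The nested branches in UniformRandomOpArgumentMapping above follow one fixed priority for the shape argument: ShapeTensorList when it is non-empty, then ShapeTensor when the shape attribute is empty, otherwise the shape attribute. Independently, a non-zero diag_num selects the *_raw signatures and a SelectedRows output selects the *_sr ones. The sketch below condenses that selection; the helper and its boolean inputs are hypothetical and only summarize the logic, since the real code builds a KernelSignature from the ArgumentMappingContext.
// Condensed, hypothetical sketch of the signature selection above.
#include <string>

std::string PickUniformRandomKernel(bool has_shape_tensor_list,
                                    bool shape_tensor_given,
                                    bool shape_attr_empty,
                                    int diag_num,
                                    bool selected_rows_out,
                                    std::string* shape_arg) {
  if (has_shape_tensor_list) {
    *shape_arg = "ShapeTensorList";
  } else if (shape_tensor_given && shape_attr_empty) {
    *shape_arg = "ShapeTensor";
  } else {
    *shape_arg = "shape";
  }
  std::string name = diag_num ? "uniform_random_raw" : "uniform_random";
  if (selected_rows_out) name += "_sr";
  return name;  // e.g. "uniform_random_raw_sr" with shape_arg = "shape"
}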
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
KernelSignature WhereGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
return KernelSignature("where_grad",
{"Condition", "X", "Y", GradVarName("Out")},
{},
{GradVarName("X"), GradVarName("Y")});
}
} // namespace phi
PD_REGISTER_ARG_MAPPING_FN(where_grad, phi::WhereGradOpArgumentMapping);
...@@ -44,6 +44,9 @@ TEST(Backend, OStream) { ...@@ -44,6 +44,9 @@ TEST(Backend, OStream) {
oss << phi::Backend::GPUDNN; oss << phi::Backend::GPUDNN;
EXPECT_EQ(oss.str(), "GPUDNN"); EXPECT_EQ(oss.str(), "GPUDNN");
oss.str(""); oss.str("");
oss << phi::Backend::KPS;
EXPECT_EQ(oss.str(), "KPS");
oss.str("");
try { try {
oss << phi::Backend::NUM_BACKENDS; oss << phi::Backend::NUM_BACKENDS;
} catch (const std::exception& exception) { } catch (const std::exception& exception) {
...@@ -61,6 +64,7 @@ TEST(Backend, StringToBackend) { ...@@ -61,6 +64,7 @@ TEST(Backend, StringToBackend) {
EXPECT_EQ(phi::Backend::NPU, pexp::StringToBackend("NPU")); EXPECT_EQ(phi::Backend::NPU, pexp::StringToBackend("NPU"));
EXPECT_EQ(phi::Backend::MKLDNN, pexp::StringToBackend("MKLDNN")); EXPECT_EQ(phi::Backend::MKLDNN, pexp::StringToBackend("MKLDNN"));
EXPECT_EQ(phi::Backend::GPUDNN, pexp::StringToBackend("GPUDNN")); EXPECT_EQ(phi::Backend::GPUDNN, pexp::StringToBackend("GPUDNN"));
EXPECT_EQ(phi::Backend::KPS, pexp::StringToBackend("KPS"));
EXPECT_EQ(static_cast<phi::Backend>( EXPECT_EQ(static_cast<phi::Backend>(
static_cast<size_t>(phi::Backend::NUM_BACKENDS) + 1), static_cast<size_t>(phi::Backend::NUM_BACKENDS) + 1),
pexp::StringToBackend("CustomBackend")); pexp::StringToBackend("CustomBackend"));
......
...@@ -146,12 +146,10 @@ TEST(CustomKernel, custom_kernel_dot) { ...@@ -146,12 +146,10 @@ TEST(CustomKernel, custom_kernel_dot) {
custom_fake_dot_kernels.end()); custom_fake_dot_kernels.end());
// 3.before register // 3.before register
auto& kernel_factory_instance = phi::KernelFactory::Instance();
auto& kernels = phi::KernelFactory::Instance().kernels(); auto& kernels = phi::KernelFactory::Instance().kernels();
EXPECT_TRUE(!kernel_factory_instance.HasCompatiblePhiKernel(op_name)); EXPECT_TRUE(kernels.find(op_name) == kernels.end());
// mock fake_dot is supported by phi for HasCompatiblePhiKernel check while // mock fake_dot is supported by phi for check while registering
// registering
auto& fake_dot_kernels = kernels[op_name]; auto& fake_dot_kernels = kernels[op_name];
EXPECT_TRUE(fake_dot_kernels.find( EXPECT_TRUE(fake_dot_kernels.find(
...@@ -196,7 +194,7 @@ TEST(CustomKernel, custom_kernel_dot) { ...@@ -196,7 +194,7 @@ TEST(CustomKernel, custom_kernel_dot) {
fake_dot_kernels.end()); fake_dot_kernels.end());
// 4.kernel select // 4.kernel select
auto kernel = kernel_factory_instance.SelectKernelOrThrowError( auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError(
op_name, phi::KernelKey(backend, layout, phi::DataType::UINT8)); op_name, phi::KernelKey(backend, layout, phi::DataType::UINT8));
// 5.prepare parameters for kernel // 5.prepare parameters for kernel
......
...@@ -426,9 +426,6 @@ class Quant2Int8MkldnnPass(object): ...@@ -426,9 +426,6 @@ class Quant2Int8MkldnnPass(object):
graph = self._apply_pass(graph, 'depthwise_conv_mkldnn_pass') graph = self._apply_pass(graph, 'depthwise_conv_mkldnn_pass')
graph = self._apply_pass(graph, 'conv_bn_fuse_pass') graph = self._apply_pass(graph, 'conv_bn_fuse_pass')
graph = self._apply_pass(graph, 'conv_eltwiseadd_bn_fuse_pass') graph = self._apply_pass(graph, 'conv_eltwiseadd_bn_fuse_pass')
graph = self._apply_pass(graph, 'conv_affine_channel_fuse_pass')
graph = self._apply_pass(graph,
'conv_eltwiseadd_affine_channel_fuse_pass')
graph = self._apply_pass(graph, 'conv_transpose_bn_fuse_pass') graph = self._apply_pass(graph, 'conv_transpose_bn_fuse_pass')
graph = self._apply_pass(graph, graph = self._apply_pass(graph,
'conv_transpose_eltwiseadd_bn_fuse_pass') 'conv_transpose_eltwiseadd_bn_fuse_pass')
......
...@@ -560,13 +560,19 @@ class DataParallel(layers.Layer): ...@@ -560,13 +560,19 @@ class DataParallel(layers.Layer):
strategy=None, strategy=None,
comm_buffer_size=25, comm_buffer_size=25,
last_comm_buffer_size=1, last_comm_buffer_size=1,
find_unused_parameters=False): find_unused_parameters=False,
process_group=None,
gradient_as_buffer_view=False,
static_graph=False):
super(DataParallel, super(DataParallel,
self).__init__(layers.full_name() + "_data_parallel") self).__init__(layers.full_name() + "_data_parallel")
self._layers = layers self._layers = layers
self.find_unused_parameters = find_unused_parameters self.find_unused_parameters = find_unused_parameters
self.grad_need_sync = True self.grad_need_sync = True
self.process_group = process_group
self.gradient_as_buffer_view = gradient_as_buffer_view
self.static_graph = static_graph
# NOTE(chenweihang): The ParallelStrategy here is not strictly a strategy. # NOTE(chenweihang): The ParallelStrategy here is not strictly a strategy.
# It just stores some environment variables, which can be constructed by # It just stores some environment variables, which can be constructed by
......
...@@ -590,7 +590,7 @@ foreach(TEST_OP ${TEST_OPS}) ...@@ -590,7 +590,7 @@ foreach(TEST_OP ${TEST_OPS})
py_test_modules(${TEST_OP} MODULES ${TEST_OP}) py_test_modules(${TEST_OP} MODULES ${TEST_OP})
endforeach(TEST_OP) endforeach(TEST_OP)
py_test_modules(test_adam_op_multi_thread MODULES test_adam_op ENVS FLAGS_inner_op_parallelism=4) py_test_modules(test_adam_op_multi_thread MODULES test_adam_op ENVS FLAGS_inner_op_parallelism=4)
if (WITH_GPU OR WITH_XPU OR WITH_ASCEND OR WITH_ASCEND_CL) if (WITH_GPU OR WITH_XPU OR WITH_ASCEND OR WITH_ASCEND_CL OR APPLE)
py_test_modules(test_warpctc_op MODULES test_warpctc_op) py_test_modules(test_warpctc_op MODULES test_warpctc_op)
set_tests_properties(test_warpctc_op PROPERTIES TIMEOUT 120) set_tests_properties(test_warpctc_op PROPERTIES TIMEOUT 120)
endif() endif()
......
...@@ -144,6 +144,11 @@ def run_model(use_distributed_lamb, use_fp16, use_master_param_norm, **kwargs): ...@@ -144,6 +144,11 @@ def run_model(use_distributed_lamb, use_fp16, use_master_param_norm, **kwargs):
grad_clip = kwargs.get('grad_clip', None) grad_clip = kwargs.get('grad_clip', None)
clip_after_allreduce = kwargs.get('clip_after_allreduce', True) clip_after_allreduce = kwargs.get('clip_after_allreduce', True)
parameters = [p.name for p in main.all_parameters()]
exclude_fn = lambda var: var.name in parameters[::4]
kwargs['exclude_from_weight_decay_fn'] = exclude_fn
kwargs['lamb_weight_decay'] = 0.1
if use_distributed_lamb: if use_distributed_lamb:
optimizer_class = DistributedFusedLamb optimizer_class = DistributedFusedLamb
kwargs = dict(kwargs) kwargs = dict(kwargs)
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from auto_scan_test import PassAutoScanTest, IgnoreReasons
from program_config import TensorConfig, ProgramConfig, OpConfig
import numpy as np
import paddle.inference as paddle_infer
from functools import partial
from typing import Optional, List, Callable, Dict, Any, Set
import unittest
import hypothesis
from hypothesis import given, settings, seed, example, assume, reproduce_failure
import hypothesis.strategies as st
class TestConvAffineChannelFusePass(PassAutoScanTest):
def is_program_valid(self, program_config: ProgramConfig) -> bool:
return True
def sample_program_config(self, draw):
padding_algorithm = draw(st.sampled_from(["EXPLICIT", "SAME", "VALID"]))
groups = draw(st.integers(min_value=1, max_value=3))
data_format = draw(st.sampled_from(["NCHW", "NHWC"]))
axis = draw(st.sampled_from([1]))
filter_channel = draw(st.integers(min_value=1, max_value=16)) * 4
filter_size = draw(st.integers(min_value=1, max_value=4))
in_channel = groups * filter_channel
out_channel_factor = draw(st.integers(min_value=1, max_value=16)) * 4
out_channel = groups * out_channel_factor
batch_size = draw(st.integers(min_value=1, max_value=4))
dilations = draw(
st.lists(
st.integers(
min_value=1, max_value=2), min_size=2, max_size=2))
paddings = draw(
st.lists(
st.integers(
min_value=0, max_value=2), min_size=2, max_size=2))
strides = draw(
st.lists(
st.integers(
min_value=1, max_value=2), min_size=2, max_size=2))
has_bias = draw(st.booleans())
x_shape = [
batch_size, in_channel, 64, 64
] if data_format == "NCHW" else [batch_size, 64, 64, in_channel]
w_shape = [out_channel, filter_channel, filter_size, filter_size]
scale_shape = [out_channel]
bias_shape = [out_channel]
def generate_input():
return np.random.random(x_shape).astype(np.float32)
def generate_weight():
return np.random.random(w_shape).astype(np.float32)
def generate_bias():
return np.random.random(bias_shape).astype(np.float32)
def generate_scale_bias():
return np.random.random(bias_shape).astype(np.float32)
conv2d_op = OpConfig(
"conv2d",
inputs={
"Input": ["input_data"],
"Filter": ["conv2d_weight"],
},
outputs={"Output": ["conv_output"]},
data_format=data_format,
dilations=dilations,
padding_algorithm=padding_algorithm,
groups=groups,
paddings=paddings,
strides=strides,
has_bias=has_bias,
is_test=True)
ac_op = OpConfig(
"affine_channel",
inputs={
"X": ["conv_output"],
"Scale": ["affine_channel_scale"],
"Bias": ["affine_channel_bias"]
},
outputs={"Out": ["affine_channel_ouput"]},
data_layout=data_format)
if has_bias == True:
conv2d_op.inputs["Bias"] = ["conv2d_bias"]
ops = [conv2d_op, ac_op]
program_config = ProgramConfig(
ops=ops,
inputs={
"input_data": TensorConfig(data_gen=partial(generate_input)),
},
weights={
"conv2d_weight":
TensorConfig(data_gen=partial(generate_weight)),
"affine_channel_scale":
TensorConfig(data_gen=partial(generate_scale_bias)),
"affine_channel_bias":
TensorConfig(data_gen=partial(generate_scale_bias)),
},
outputs=["affine_channel_ouput"])
if has_bias == True:
program_config.weights["conv2d_bias"] = TensorConfig(
data_gen=partial(generate_bias))
return program_config
def sample_predictor_configs(self, program_config):
config = self.create_inference_config(use_gpu=True)
yield config, ['conv2d', 'elementwise_add'], (1e-4, 1e-4)
config = self.create_inference_config(use_mkldnn=True)
yield config, ['conv2d', 'elementwise_add'], (1e-4, 1e-4)
def add_ignore_pass_case(self):
        # Once the underlying problem is fixed, remove the corresponding
        # check in is_program_valid.
def teller1(program_config, predictor_config):
if program_config.ops[0].attrs['data_format'] == "NHWC":
return True
return False
        # The MKLDNN output differs from the reference when bias is present.
def teller2(program_config, predictor_config):
return predictor_config.mkldnn_enabled() and program_config.ops[
0].attrs['has_bias'] == True
self.add_ignore_check_case(
teller1, IgnoreReasons.PASS_ACCURACY_ERROR,
"The output format of conv2d is wrong when data_format attribute is NHWC, \
because currently its fused op (Conv2DFusion) only supports data format of channel first (NCHW)."
)
self.add_ignore_check_case(
teller2, IgnoreReasons.PASS_ACCURACY_ERROR,
"Currently mkldnn Output has diff with bias!")
def test(self):
self.run_and_statis(
quant=False,
passes=["conv_affine_channel_fuse_pass"], )
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from auto_scan_test import PassAutoScanTest, IgnoreReasons
from program_config import TensorConfig, ProgramConfig, OpConfig
import numpy as np
import paddle.inference as paddle_infer
from functools import partial
from typing import Optional, List, Callable, Dict, Any, Set
import unittest
import hypothesis
from hypothesis import given, settings, seed, example, assume
import hypothesis.strategies as st
class TestConvEltwiseAddAffineChannelFusePass(PassAutoScanTest):
def is_program_valid(self, program_config: ProgramConfig) -> bool:
attrs = [
program_config.ops[i].attrs
for i in range(len(program_config.ops))
]
if attrs[0]['data_format'] == "NHWC" and attrs[1]['axis'] != 3:
return False
return True
def sample_program_config(self, draw):
padding_algorithm = draw(st.sampled_from(["EXPLICIT", "SAME", "VALID"]))
groups = draw(st.integers(min_value=1, max_value=3))
data_format = draw(st.sampled_from(["NCHW", "NHWC"]))
axis = draw(st.sampled_from([1]))
filter_channel = draw(st.integers(min_value=1, max_value=16)) * 4
filter_size = draw(st.integers(min_value=1, max_value=4))
in_channel = groups * filter_channel
out_channel_factor = draw(st.integers(min_value=1, max_value=16)) * 4
out_channel = groups * out_channel_factor
batch_size = draw(st.integers(min_value=1, max_value=4))
dilations = draw(
st.lists(
st.integers(
min_value=1, max_value=2), min_size=2, max_size=2))
paddings = draw(
st.lists(
st.integers(
min_value=0, max_value=2), min_size=2, max_size=2))
strides = draw(
st.lists(
st.integers(
min_value=1, max_value=2), min_size=2, max_size=2))
has_bias = draw(st.booleans())
x_shape = [
batch_size, in_channel, 64, 64
] if data_format == "NCHW" else [batch_size, 64, 64, in_channel]
w_shape = [out_channel, filter_channel, filter_size, filter_size]
scale_shape = [out_channel]
bias_shape = [out_channel]
def generate_input():
return np.random.random(x_shape).astype(np.float32)
def generate_weight():
return np.random.random(w_shape).astype(np.float32)
def generate_bias():
return np.random.random(bias_shape).astype(np.float32)
def generate_scale_bias():
return np.random.random(bias_shape).astype(np.float32)
conv2d_op = OpConfig(
"conv2d",
inputs={
"Input": ["input_data"],
"Filter": ["conv2d_weight"],
},
outputs={"Output": ["conv_output"]},
data_format=data_format,
dilations=dilations,
padding_algorithm=padding_algorithm,
groups=groups,
paddings=paddings,
strides=strides,
has_bias=has_bias,
is_test=True)
eltwise_op = OpConfig(
"elementwise_add",
inputs={"X": ["conv_output"],
"Y": ["conv2d_bias"]},
outputs={"Out": ["elementwise_output"]},
axis=axis)
ac_op = OpConfig(
"affine_channel",
inputs={
"X": ["elementwise_output"],
"Scale": ["affine_channel_scale"],
"Bias": ["affine_channel_bias"]
},
outputs={"Out": ["affine_channel_ouput"]},
data_layout=data_format)
if has_bias == True:
conv2d_op.inputs["Bias"] = ["conv2d_bias"]
ops = [conv2d_op, eltwise_op, ac_op]
program_config = ProgramConfig(
ops=ops,
inputs={
"input_data": TensorConfig(data_gen=partial(generate_input)),
},
weights={
"conv2d_weight":
TensorConfig(data_gen=partial(generate_weight)),
"conv2d_bias": TensorConfig(data_gen=partial(generate_bias)),
"affine_channel_scale":
TensorConfig(data_gen=partial(generate_scale_bias)),
"affine_channel_bias":
TensorConfig(data_gen=partial(generate_scale_bias)),
},
outputs=["affine_channel_ouput"])
return program_config
def sample_predictor_configs(self, program_config):
config = self.create_inference_config(use_gpu=True)
yield config, ['conv2d', 'elementwise_add'], (1e-4, 1e-4)
config = self.create_inference_config(use_mkldnn=True)
yield config, ['conv2d', 'elementwise_add'], (1e-4, 1e-4)
# TRT
config = self.create_trt_inference_config()
config.enable_tensorrt_engine(
workspace_size=1 << 20,
max_batch_size=4,
min_subgraph_size=1,
precision_mode=paddle_infer.PrecisionType.Float32,
use_static=False,
use_calib_mode=False)
yield config, ['conv2d', 'elementwise_add'], (1e-4, 1e-4)
def add_ignore_pass_case(self):
        # Once the underlying problem is fixed, remove the corresponding
        # check in is_program_valid.
def teller1(program_config, predictor_config):
if program_config.ops[0].attrs['data_format'] == "NHWC":
return True
return False
        # The MKLDNN output differs from the reference when bias is present.
def teller2(program_config, predictor_config):
return predictor_config.mkldnn_enabled() and program_config.ops[
0].attrs['has_bias'] == True
self.add_ignore_check_case(
teller1, IgnoreReasons.PASS_ACCURACY_ERROR,
"The output format of conv2d is wrong when data_format attribute is NHWC, \
it will trigger Broadcast dimension mismatch bug \
when data_format attribute is NHWC and axis of eltwise op is 1 for this pass."
)
self.add_ignore_check_case(
teller2, IgnoreReasons.PASS_ACCURACY_ERROR,
"Currently mkldnn Output has diff with bias!")
def test(self):
self.run_and_statis(
quant=False,
passes=["conv_eltwiseadd_affine_channel_fuse_pass"], )
if __name__ == "__main__":
unittest.main()
...@@ -482,7 +482,12 @@ class OpTest(unittest.TestCase): ...@@ -482,7 +482,12 @@ class OpTest(unittest.TestCase):
op_proto = OpProtoHolder.instance().get_op_proto(self.op_type) op_proto = OpProtoHolder.instance().get_op_proto(self.op_type)
"infer datatype from inputs and outputs for this test case" "infer datatype from inputs and outputs for this test case"
self.infer_dtype_from_inputs_outputs(self.inputs, self.outputs) if self.is_bfloat16_op():
self.dtype = np.uint16
self.__class__.dtype = self.dtype
self.output_dtype = np.uint16
else:
self.infer_dtype_from_inputs_outputs(self.inputs, self.outputs)
inputs = append_input_output(block, op_proto, self.inputs, True, inputs = append_input_output(block, op_proto, self.inputs, True,
self.dtype) self.dtype)
outputs = append_input_output(block, op_proto, self.outputs, False, outputs = append_input_output(block, op_proto, self.outputs, False,
...@@ -1135,7 +1140,7 @@ class OpTest(unittest.TestCase): ...@@ -1135,7 +1140,7 @@ class OpTest(unittest.TestCase):
else: else:
atol = 2 atol = 2
else: else:
atol = 1e-2 atol = 1e-1
if no_check_set is not None: if no_check_set is not None:
if self.op_type not in no_check_set_white_list.no_check_set_white_list: if self.op_type not in no_check_set_white_list.no_check_set_white_list:
......
...@@ -55,7 +55,7 @@ class TestDiffOp(unittest.TestCase): ...@@ -55,7 +55,7 @@ class TestDiffOp(unittest.TestCase):
def test_dygraph(self): def test_dygraph(self):
for place in self.places: for place in self.places:
paddle.disable_static(place) paddle.disable_static()
x = paddle.to_tensor(self.input, place=place) x = paddle.to_tensor(self.input, place=place)
if self.prepend is not None: if self.prepend is not None:
self.prepend = paddle.to_tensor(self.prepend, place=place) self.prepend = paddle.to_tensor(self.prepend, place=place)
......
...@@ -16,7 +16,7 @@ from __future__ import print_function ...@@ -16,7 +16,7 @@ from __future__ import print_function
import unittest import unittest
import numpy as np import numpy as np
from op_test import OpTest from op_test import OpTest, convert_float_to_uint16
import paddle import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
from paddle.framework import core from paddle.framework import core
...@@ -117,6 +117,39 @@ class TestCase6(TestGatherOp): ...@@ -117,6 +117,39 @@ class TestCase6(TestGatherOp):
self.index_type = "int32" self.index_type = "int32"
class TestGatherBF16Op(OpTest):
def setUp(self):
self.op_type = "gather"
self.dtype = np.uint16
self.config()
xnp = np.random.random(self.x_shape).astype(np.float32)
axis_np = np.array(self.axis).astype(self.axis_type)
index_np = np.array(self.index).astype(self.index_type)
self.inputs = {
'X': convert_float_to_uint16(xnp),
'Index': index_np,
'Axis': axis_np
}
out = gather_numpy(self.inputs['X'], index_np, axis_np[0])
self.outputs = {'Out': out}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(['X'], 'Out', numeric_grad_delta=0.5)
def config(self):
"""
For multi-dimension input
"""
self.x_shape = (3, 88, 3)
self.index = [1, 3, 5]
self.index_type = "int32"
self.axis = [1]
self.axis_type = "int32"
class TestGatherOp1(OpTest): class TestGatherOp1(OpTest):
def setUp(self): def setUp(self):
self.op_type = "gather" self.op_type = "gather"
......
...@@ -26,159 +26,149 @@ import paddle.fluid.dygraph as dygraph ...@@ -26,159 +26,149 @@ import paddle.fluid.dygraph as dygraph
from paddle.fluid.dygraph.nn import Linear from paddle.fluid.dygraph.nn import Linear
import paddle.fluid.core as core import paddle.fluid.core as core
from paddle.fluid.optimizer import SGDOptimizer from paddle.fluid.optimizer import SGDOptimizer
from paddle.fluid.framework import _test_eager_guard
class MLP(fluid.Layer):
def __init__(self, param_attr=None, bias_attr=None):
super(MLP, self).__init__()
self._linear1 = Linear(784, 10)
self._linear2 = Linear(10, 10)
def forward(self, inputs):
y = self._linear1(inputs)
y = self._linear2(y)
return y
class TestDataParallelGroup(unittest.TestCase): class TestDataParallelGroup(unittest.TestCase):
def create_varbase(self, dtype, shape, def create_varbase(self, dtype, shape):
type=core.VarDesc.VarType.LOD_TENSOR): return paddle.rand(shape=shape, dtype=dtype)
return core.VarBase(dtype, shape, "", type, True)
def assign_group_by_size(self, *args):
return core.assign_group_by_size(*args)
def test_construct_group0(self): def test_construct_group0(self):
# one dtype & one limit capability # one dtype & one limit capability
var_list = [] var_list = []
var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 50])) var_list.append(self.create_varbase("float32", [2, 50]))
var_list.append( var_list.append(self.create_varbase("float32", [2, 100]))
self.create_varbase(core.VarDesc.VarType.FP32, [2, 100])) var_list.append(self.create_varbase("float32", [2, 50]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 50])) var_list.append(self.create_varbase("float32", [2, 25]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 25])) res = self.assign_group_by_size(var_list, [False, False, False, False],
res = core.assign_group_by_size(var_list, [False, False, False, False],
[400]) [400])
self.assertEqual([[0], [1], [2], [3]], res) self.assertEqual([[0], [1], [2], [3]], res)
def test_construct_group1(self): def test_construct_group1(self):
# multi dtype & one limit capability # multi dtype & one limit capability
var_list = [] var_list = []
var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) var_list.append(self.create_varbase("float32", [1, 50]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) var_list.append(self.create_varbase("float64", [1, 25]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) var_list.append(self.create_varbase("float32", [1, 50]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) var_list.append(self.create_varbase("float64", [1, 25]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) var_list.append(self.create_varbase("float32", [1, 50]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) var_list.append(self.create_varbase("float64", [1, 25]))
res = core.assign_group_by_size( res = self.assign_group_by_size(
var_list, [False, False, False, False, False, False], [400]) var_list, [False, False, False, False, False, False], [400])
self.assertEqual([[0, 2], [1, 3], [4], [5]], res) self.assertEqual([[0, 2], [1, 3], [4], [5]], res)
def test_construct_group2(self): def test_construct_group2(self):
# one dtype & multi limit capability # one dtype & multi limit capability
var_list = [] var_list = []
var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 50])) var_list.append(self.create_varbase("float32", [2, 50]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 50])) var_list.append(self.create_varbase("float32", [2, 50]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 50])) var_list.append(self.create_varbase("float32", [2, 50]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 50])) var_list.append(self.create_varbase("float32", [2, 50]))
res = core.assign_group_by_size(var_list, [False, False, False, False], res = self.assign_group_by_size(var_list, [False, False, False, False],
[400, 800]) [400, 800])
self.assertEqual([[0], [1, 2], [3]], res) self.assertEqual([[0], [1, 2], [3]], res)
def test_construct_group3(self): def test_construct_group3(self):
# multi dtype & multi limit capability # multi dtype & multi limit capability
var_list = [] var_list = []
var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) var_list.append(self.create_varbase("float32", [1, 50]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) var_list.append(self.create_varbase("float64", [1, 25]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) var_list.append(self.create_varbase("float32", [1, 50]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) var_list.append(self.create_varbase("float64", [1, 25]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) var_list.append(self.create_varbase("float32", [1, 50]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) var_list.append(self.create_varbase("float64", [1, 25]))
res = core.assign_group_by_size( res = self.assign_group_by_size(
var_list, [False, False, False, False, False, False], [200, 400]) var_list, [False, False, False, False, False, False], [200, 400])
self.assertEqual([[0], [1], [2, 4], [3, 5]], res) self.assertEqual([[0], [1], [2, 4], [3, 5]], res)
def test_construct_group4(self): def test_construct_group4(self):
# multi dtype & zero limit capability # multi dtype & zero limit capability
var_list = [] var_list = []
var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) var_list.append(self.create_varbase("float32", [1, 50]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) var_list.append(self.create_varbase("float64", [1, 25]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) var_list.append(self.create_varbase("float32", [1, 50]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) var_list.append(self.create_varbase("float64", [1, 25]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) var_list.append(self.create_varbase("float32", [1, 50]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) var_list.append(self.create_varbase("float64", [1, 25]))
res = core.assign_group_by_size( res = self.assign_group_by_size(
var_list, [False, False, False, False, False, False], [0]) var_list, [False, False, False, False, False, False], [0])
self.assertEqual([[0], [1], [2], [3], [4], [5]], res) self.assertEqual([[0], [1], [2], [3], [4], [5]], res)
def test_construct_group5(self): def test_construct_group5(self):
# multi dtype & infinite capability # multi dtype & infinite capability
var_list = [] var_list = []
var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) var_list.append(self.create_varbase("float32", [1, 50]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) var_list.append(self.create_varbase("float64", [1, 25]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) var_list.append(self.create_varbase("float32", [1, 50]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) var_list.append(self.create_varbase("float64", [1, 25]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) var_list.append(self.create_varbase("float32", [1, 50]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) var_list.append(self.create_varbase("float64", [1, 25]))
res = core.assign_group_by_size( res = self.assign_group_by_size(
var_list, [False, False, False, False, False, False], [10000]) var_list, [False, False, False, False, False, False], [10000])
self.assertEqual([[0, 2, 4], [1, 3, 5]], res) self.assertEqual([[0, 2, 4], [1, 3, 5]], res)
def test_construct_group6(self): def test_construct_group6(self):
# multi dtype & limit capability & multi tensor type # multi dtype & limit capability & multi tensor type
var_list = [] var_list = []
var_list.append( var_list.append(self.create_varbase(
self.create_varbase(core.VarDesc.VarType.FP32, [1, 50], "float32",
core.VarDesc.VarType.SELECTED_ROWS)) [1, 50], ))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) var_list.append(self.create_varbase("float64", [1, 25]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) var_list.append(self.create_varbase("float32", [1, 50]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) var_list.append(self.create_varbase("float64", [1, 25]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) var_list.append(self.create_varbase("float32", [1, 50]))
var_list.append( var_list.append(self.create_varbase("float64", [1, 25]))
self.create_varbase(core.VarDesc.VarType.FP64, [1, 25], res = self.assign_group_by_size(
core.VarDesc.VarType.SELECTED_ROWS))
res = core.assign_group_by_size(
var_list, [True, False, False, False, False, True], [400]) var_list, [True, False, False, False, False, True], [400])
self.assertEqual([[0], [1, 3], [2, 4], [5]], res) self.assertEqual([[0], [1, 3], [2, 4], [5]], res)
def test_construct_group7(self): def test_construct_group7(self):
# multi dtype & multi limit capability & multi tensor type # multi dtype & multi limit capability & multi tensor type
var_list = [] var_list = []
var_list.append( var_list.append(self.create_varbase("float32", [1, 50]))
self.create_varbase(core.VarDesc.VarType.FP32, [1, 50], var_list.append(self.create_varbase("float64", [1, 25]))
core.VarDesc.VarType.SELECTED_ROWS)) var_list.append(self.create_varbase("float32", [1, 50]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) var_list.append(self.create_varbase("float64", [1, 25]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) var_list.append(self.create_varbase("float32", [1, 50]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) var_list.append(self.create_varbase("float64", [1, 25]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) res = self.assign_group_by_size(
var_list.append(
self.create_varbase(core.VarDesc.VarType.FP64, [1, 25],
core.VarDesc.VarType.SELECTED_ROWS))
res = core.assign_group_by_size(
var_list, [True, False, False, False, False, True], [200, 400]) var_list, [True, False, False, False, False, True], [200, 400])
self.assertEqual([[0], [1], [2], [3], [4], [5]], res) self.assertEqual([[0], [1], [2], [3], [4], [5]], res)
def test_construct_group8(self): def test_construct_group8(self):
# one dtype & one limit capability & have tensor_indices # one dtype & one limit capability & have tensor_indices
var_list = [] var_list = []
var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 25])) var_list.append(self.create_varbase("float32", [2, 25]))
var_list.append( var_list.append(self.create_varbase("float32", [2, 100]))
self.create_varbase(core.VarDesc.VarType.FP32, [2, 100])) var_list.append(self.create_varbase("float32", [2, 50]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 50])) var_list.append(self.create_varbase("float32", [2, 25]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 25])) res = self.assign_group_by_size(var_list, [False, False, False, False],
res = core.assign_group_by_size(var_list, [False, False, False, False],
[400], [3, 0, 1, 2]) [400], [3, 0, 1, 2])
self.assertEqual([[3, 0], [1], [2]], res) self.assertEqual([[3, 0], [1], [2]], res)
def test_construct_group9(self): def test_construct_group9(self):
# one dtype & one limit capability & have tensor_indices # one dtype & one limit capability & have tensor_indices
var_list = [] var_list = []
var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 25])) var_list.append(self.create_varbase("float32", [2, 25]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 25])) var_list.append(self.create_varbase("float32", [2, 25]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 25])) var_list.append(self.create_varbase("float32", [2, 25]))
var_list.append( var_list.append(self.create_varbase("float32", [2, 1000]))
self.create_varbase(core.VarDesc.VarType.FP32, [2, 1000])) res = self.assign_group_by_size(var_list, [False, False, False, True],
res = core.assign_group_by_size(var_list, [False, False, False, True],
[300], [1, 0, 2, 3]) [300], [1, 0, 2, 3])
self.assertEqual([[1, 0], [3], [2]], res) self.assertEqual([[1, 0], [3], [2]], res)
class TestDataParallelGroupEager(TestDataParallelGroup):
def create_varbase(self, dtype, shape):
with _test_eager_guard():
return paddle.rand(shape=shape, dtype=dtype)
def assign_group_by_size(self, *args):
return core.eager_assign_group_by_size(*args)
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
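Note on the expectations asserted above: core.assign_group_by_size and core.eager_assign_group_by_size bucket gradient tensors for the dygraph DataParallel reducer. The real logic lives in C++; the Python sketch below is only an approximation reverse-engineered from these test cases (per-dtype accumulation up to a byte limit, sparse gradients isolated in their own groups, optional tensor_indices ordering), and every helper name in it is invented for illustration.

import numpy as np

# Illustrative approximation only; the real grouping is implemented in C++
# behind core.assign_group_by_size / core.eager_assign_group_by_size.
_ITEMSIZE = {"float32": 4, "float64": 8}

def assign_group_by_size_sketch(tensors, is_sparse_gradient, group_size_limits,
                                tensor_indices=None):
    order = tensor_indices if tensor_indices else range(len(tensors))
    groups = []        # finished groups, in the order they are closed
    open_groups = {}   # dtype -> (indices, accumulated bytes)
    limit_index = {}   # dtype -> which entry of group_size_limits applies next
    for i in order:
        dtype, shape = tensors[i]
        if is_sparse_gradient[i]:
            groups.append([i])  # a sparse gradient always gets a group of its own
            continue
        indices, nbytes = open_groups.pop(dtype, ([], 0))
        indices.append(i)
        nbytes += int(np.prod(shape)) * _ITEMSIZE[dtype]
        limit = group_size_limits[min(limit_index.get(dtype, 0),
                                      len(group_size_limits) - 1)]
        if nbytes >= limit:  # close the group once the current limit is reached
            groups.append(indices)
            limit_index[dtype] = limit_index.get(dtype, 0) + 1
        else:
            open_groups[dtype] = (indices, nbytes)
    groups.extend(indices for indices, _ in open_groups.values())  # flush leftovers
    if tensor_indices is None:
        groups.sort(key=lambda g: g[0])
    return groups

# Reproduces the expectation of test_construct_group6 above:
tensors = [("float32", [1, 50]), ("float64", [1, 25]), ("float32", [1, 50]),
           ("float64", [1, 25]), ("float32", [1, 50]), ("float64", [1, 25])]
print(assign_group_by_size_sketch(
    tensors, [True, False, False, False, False, True], [400]))  # [[0], [1, 3], [2, 4], [5]]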
...@@ -375,6 +375,53 @@ class TestFP16ScaleBiasLayerNorm(unittest.TestCase): ...@@ -375,6 +375,53 @@ class TestFP16ScaleBiasLayerNorm(unittest.TestCase):
assert_equal(b_g_np_1, b_g_np_2) assert_equal(b_g_np_1, b_g_np_2)
class TestBF16ScaleBiasLayerNorm(unittest.TestCase):
def check_main(self, x_np, weight_np, bias_np, dtype):
paddle.disable_static()
x = paddle.to_tensor(x_np)
weight = paddle.to_tensor(weight_np)
bias = paddle.to_tensor(bias_np)
if dtype == "bfloat16":
x = x.cast(paddle.fluid.core.VarDesc.VarType.BF16)
x.stop_gradient = False
weight.stop_gradient = False
bias.stop_gradient = False
y = F.layer_norm(x, x.shape[1:], weight, bias)
x_g, w_g, b_g = paddle.grad(y, [x, weight, bias])
y_np = y.cast('float32').numpy()
x_g_np = x_g.cast('float32').numpy()
w_g_np = w_g.cast('float32').numpy()
b_g_np = b_g.cast('float32').numpy()
paddle.enable_static()
return y_np, x_g_np, w_g_np, b_g_np
def test_main(self):
if (not core.is_compiled_with_cuda()) or (core.cudnn_version() < 8100):
return
x_np = np.random.random([10, 20]).astype('float32')
weight_np = np.random.random([20]).astype('float32')
bias_np = np.random.random([20]).astype('float32')
y_np_1, x_g_np_1, w_g_np_1, b_g_np_1 = self.check_main(
x_np, weight_np, bias_np, 'float32')
y_np_2, x_g_np_2, w_g_np_2, b_g_np_2 = self.check_main(
x_np, weight_np, bias_np, 'bfloat16')
def assert_equal(x, y):
self.assertTrue(np.allclose(x, y, atol=1.e-1))
assert_equal(y_np_1, y_np_2)
assert_equal(x_g_np_1, x_g_np_2)
assert_equal(w_g_np_1, w_g_np_2)
assert_equal(b_g_np_1, b_g_np_2)
class TestGetSetKeepLayerNormScaleBiasFP32Flag(unittest.TestCase): class TestGetSetKeepLayerNormScaleBiasFP32Flag(unittest.TestCase):
def test_main(self): def test_main(self):
self.assertTrue(_keep_layer_norm_scale_bias_to_fp32()) self.assertTrue(_keep_layer_norm_scale_bias_to_fp32())
......
...@@ -16,7 +16,7 @@ from __future__ import print_function ...@@ -16,7 +16,7 @@ from __future__ import print_function
import unittest import unittest
import numpy as np import numpy as np
from op_test import OpTest from op_test import OpTest, convert_float_to_uint16
import paddle import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.core as core import paddle.fluid.core as core
...@@ -282,6 +282,80 @@ class TestPnormOpFP161(TestPnormOpFP16): ...@@ -282,6 +282,80 @@ class TestPnormOpFP161(TestPnormOpFP16):
self.asvector = True self.asvector = True
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestPnormBF16Op(OpTest):
def setUp(self):
self.op_type = "p_norm"
self.init_test_case()
self.x = (np.random.random(self.shape) + 0.5).astype(np.float32)
self.norm = p_norm(self.x, self.axis, self.porder, self.keepdim,
self.asvector)
self.gradient = self.calc_gradient()
self.inputs = {'X': convert_float_to_uint16(self.x)}
self.attrs = {
'epsilon': self.epsilon,
'axis': self.axis,
'keepdim': self.keepdim,
'porder': float(self.porder),
'asvector': self.asvector
}
self.outputs = {'Out': convert_float_to_uint16(self.norm)}
def test_check_output(self):
place = core.CUDAPlace(0)
self.check_output_with_place(place, atol=1e-3)
def test_check_grad(self):
place = core.CUDAPlace(0)
self.check_grad_with_place(
place, ['X'], 'Out', user_defined_grads=self.gradient)
def init_test_case(self):
self.shape = [2, 3, 4, 5]
self.axis = 1
self.epsilon = 1e-12
self.porder = 2.0
self.keepdim = False
self.dtype = np.uint16
self.asvector = False
def calc_gradient(self):
self.attrs = {
'epsilon': self.epsilon,
'axis': self.axis,
'keepdim': self.keepdim,
'porder': float(self.porder),
'asvector': self.asvector
}
x = self.x
porder = self.attrs["porder"]
axis = self.attrs["axis"]
asvector = self.attrs["asvector"]
x_dtype = x.dtype
x = x.astype(np.float32) if x.dtype == np.float16 else x
if porder == 0:
grad = np.zeros(x.shape).astype(x.dtype)
elif porder in [float("inf"), float("-inf")]:
norm = p_norm(
x, axis=axis, porder=porder, keepdims=True, reduce_all=asvector)
x_abs = np.abs(x)
grad = np.sign(x)
grad[x_abs != norm] = 0.0
else:
norm = p_norm(
x, axis=axis, porder=porder, keepdims=True, reduce_all=asvector)
grad = np.power(norm, 1 - porder) * np.power(
np.abs(x), porder - 1) * np.sign(x)
numel = 1
for s in x.shape:
numel *= s
divisor = numel if asvector else x.shape[axis]
numel /= divisor
return [grad.astype(x_dtype) * 1 / numel]
def run_fro(self, p, axis, shape_x, dtype, keep_dim, check_dim=False): def run_fro(self, p, axis, shape_x, dtype, keep_dim, check_dim=False):
with fluid.program_guard(fluid.Program()): with fluid.program_guard(fluid.Program()):
data = fluid.data(name="X", shape=shape_x, dtype=dtype) data = fluid.data(name="X", shape=shape_x, dtype=dtype)
......
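A note on the reference gradient in TestPnormBF16Op.calc_gradient above (my own restatement, not part of the patch): for a finite, nonzero order p the code reproduces the analytic derivative

\[
\frac{\partial \lVert x \rVert_p}{\partial x_i}
  = \frac{\operatorname{sign}(x_i)\,\lvert x_i \rvert^{\,p-1}}{\lVert x \rVert_p^{\,p-1}},
\qquad
\frac{\partial \lVert x \rVert_{\pm\infty}}{\partial x_i}
  = \begin{cases}
      \operatorname{sign}(x_i), & \lvert x_i \rvert = \lVert x \rVert_{\pm\infty},\\
      0, & \text{otherwise,}
    \end{cases}
\]

which is exactly np.power(norm, 1 - porder) * np.power(np.abs(x), porder - 1) * np.sign(x) together with the inf/-inf branch. The trailing 1 / numel factor appears to match the averaged output gradient used by the numeric check in check_grad_with_place: after dividing by shape[axis] (or by the full size when asvector is set), numel equals the number of elements in the reduced output.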
...@@ -16,7 +16,7 @@ from __future__ import print_function ...@@ -16,7 +16,7 @@ from __future__ import print_function
import unittest import unittest
import numpy as np import numpy as np
from op_test import OpTest, skip_check_grad_ci from op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16
import paddle import paddle
import paddle.fluid.core as core import paddle.fluid.core as core
import paddle.fluid as fluid import paddle.fluid as fluid
...@@ -61,6 +61,37 @@ class TestSumOp_fp16(OpTest): ...@@ -61,6 +61,37 @@ class TestSumOp_fp16(OpTest):
self.check_grad(['X'], 'Out', user_defined_grads=self.gradient) self.check_grad(['X'], 'Out', user_defined_grads=self.gradient)
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestSumOp_bf16(OpTest):
def setUp(self):
np.random.seed(100)
self.op_type = "reduce_sum"
self.dtype = np.uint16
self.x = np.random.uniform(0, 0.1, (2, 5, 10)).astype(np.float32)
self.attrs = {'dim': [0, 1, 2]}
self.out = self.x.sum(axis=tuple(self.attrs['dim']))
self.gradient = self.calc_gradient()
self.inputs = {'X': convert_float_to_uint16(self.x)}
self.outputs = {'Out': convert_float_to_uint16(self.out)}
self.gradient = self.calc_gradient()
def test_check_output(self):
place = core.CUDAPlace(0)
self.check_output_with_place(place)
def test_check_grad(self):
place = core.CUDAPlace(0)
self.check_grad_with_place(
place, ['X'], 'Out', user_defined_grads=self.gradient)
def calc_gradient(self):
x = self.x
grad = np.ones(x.shape, dtype=x.dtype)
return [grad]
class TestSumOp_fp16_withInt(OpTest): class TestSumOp_fp16_withInt(OpTest):
def setUp(self): def setUp(self):
self.op_type = "reduce_sum" self.op_type = "reduce_sum"
......
...@@ -16,7 +16,7 @@ from __future__ import print_function ...@@ -16,7 +16,7 @@ from __future__ import print_function
import unittest import unittest
import numpy as np import numpy as np
from op_test import OpTest from op_test import OpTest, convert_float_to_uint16
import paddle import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.core as core import paddle.fluid.core as core
...@@ -153,6 +153,23 @@ class TestScaleFp16Op(TestScaleOp): ...@@ -153,6 +153,23 @@ class TestScaleFp16Op(TestScaleOp):
place, ["X"], "Out", max_relative_error=0.05) place, ["X"], "Out", max_relative_error=0.05)
class TestScaleBF16Op(OpTest):
def setUp(self):
self.op_type = "scale"
self.dtype = np.uint16
self.attrs = {'scale': -2.3}
x = np.random.random((10, 10)).astype(np.float32)
out = x * np.float32(self.attrs['scale'])
self.inputs = {'X': convert_float_to_uint16(x)}
self.outputs = {'Out': convert_float_to_uint16(out)}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(['X'], 'Out', numeric_grad_delta=0.8)
@unittest.skipIf(not core.is_compiled_with_cuda(), @unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA") "core is not compiled with CUDA")
class TestScaleFp16OpSelectedRows(TestScaleOpSelectedRows): class TestScaleFp16OpSelectedRows(TestScaleOpSelectedRows):
......
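The BF16 tests in these files all build their inputs with convert_float_to_uint16 from op_test. Conceptually, bfloat16 is the upper half of an IEEE-754 float32 stored in a uint16 container; a minimal truncation-based sketch of the round trip (my own, the real helper may round rather than truncate) looks like this:

import numpy as np

def float32_to_bfloat16_bits(x):
    # Keep the top 16 bits of the float32 pattern: sign + 8 exponent bits + 7 mantissa bits.
    x = np.asarray(x, dtype=np.float32)
    return (x.view(np.uint32) >> 16).astype(np.uint16)

def bfloat16_bits_to_float32(bits):
    # Re-expand by padding the discarded low 16 mantissa bits with zeros.
    return (bits.astype(np.uint32) << 16).view(np.float32)

x = np.random.random((10, 10)).astype(np.float32)
x_bf16 = float32_to_bfloat16_bits(x)   # what the 'X' input of these ops receives
assert x_bf16.dtype == np.uint16
assert np.allclose(x, bfloat16_bits_to_float32(x_bf16), atol=1e-2)

The roughly three significant decimal digits that survive this truncation are also why the tests above use loose tolerances such as atol=1e-3 and numeric_grad_delta=0.8.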
...@@ -298,6 +298,32 @@ def create_test_sum_fp16_class(parent): ...@@ -298,6 +298,32 @@ def create_test_sum_fp16_class(parent):
globals()[cls_name] = TestSumFp16Case globals()[cls_name] = TestSumFp16Case
#----------- test bf16 -----------
class TestSumBF16Op(OpTest):
def setUp(self):
self.op_type = "sum"
self.init_kernel_type()
x0 = np.random.random((3, 40)).astype(np.float32)
x1 = np.random.random((3, 40)).astype(np.float32)
x2 = np.random.random((3, 40)).astype(np.float32)
y = x0 + x1 + x2
self.inputs = {
"X": [("x0", convert_float_to_uint16(x0)),
("x1", convert_float_to_uint16(x1)),
("x2", convert_float_to_uint16(x2))]
}
self.outputs = {'Out': convert_float_to_uint16(y)}
def init_kernel_type(self):
self.dtype = np.uint16
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(['x0'], 'Out', numeric_grad_delta=0.5)
class API_Test_Add_n(unittest.TestCase): class API_Test_Add_n(unittest.TestCase):
def test_api(self): def test_api(self):
with fluid.program_guard(fluid.Program(), fluid.Program()): with fluid.program_guard(fluid.Program(), fluid.Program()):
......
...@@ -171,10 +171,7 @@ class DistributedFusedLamb(Optimizer): ...@@ -171,10 +171,7 @@ class DistributedFusedLamb(Optimizer):
moment2.is_distributed = True moment2.is_distributed = True
beta1pow = self._create_persistable_var('beta1pow') beta1pow = self._create_persistable_var('beta1pow')
beta2pow = self._create_persistable_var('beta2pow') beta2pow = self._create_persistable_var('beta2pow')
fused_indices = self._create_persistable_var(
'fused_indices', dtype='int32')
weight_decay = self._create_persistable_var('weight_decay')
weight_decay.is_distributed = True
param_info = self._create_persistable_var('param_info', dtype='int32') param_info = self._create_persistable_var('param_info', dtype='int32')
param_info.is_distributed = True param_info.is_distributed = True
...@@ -189,17 +186,20 @@ class DistributedFusedLamb(Optimizer): ...@@ -189,17 +186,20 @@ class DistributedFusedLamb(Optimizer):
'fp16_partial_fused_offsets', dtype='int32') 'fp16_partial_fused_offsets', dtype='int32')
fp16_partial_fused_offsets.is_distributed = True fp16_partial_fused_offsets.is_distributed = True
param_order = self._create_persistable_var('param_order', dtype='int32')
param_order.is_distributed = True
rank = get_rank() rank = get_rank()
nranks = get_world_size() nranks = get_world_size()
scale = self._get_or_create_scale() scale = self._get_or_create_scale()
params = [p for p, _ in params_grads] params = [p for p, _ in params_grads]
grads = [g for _, g in params_grads] grads = [g for _, g in params_grads]
weight_decay_values = [self._weight_decay] * len(params) apply_weight_decay = [1] * len(params)
if self._exclude_from_weight_decay_fn is not None: if self._exclude_from_weight_decay_fn is not None:
for i, p in enumerate(params): for i, p in enumerate(params):
if self._exclude_from_weight_decay_fn(p): if self._exclude_from_weight_decay_fn(p):
weight_decay_values[i] = 0.0 apply_weight_decay[i] = 0
startup_block = self.helper.startup_program.global_block() startup_block = self.helper.startup_program.global_block()
for g in grads: for g in grads:
...@@ -225,8 +225,6 @@ class DistributedFusedLamb(Optimizer): ...@@ -225,8 +225,6 @@ class DistributedFusedLamb(Optimizer):
'Moment2': [moment2], 'Moment2': [moment2],
'Beta1Pow': [beta1pow], 'Beta1Pow': [beta1pow],
'Beta2Pow': [beta2pow], 'Beta2Pow': [beta2pow],
'FusedIndices': [fused_indices],
'WeightDecay': [weight_decay],
'GlobalScale': [scale], 'GlobalScale': [scale],
'ParamInfo': [param_info], 'ParamInfo': [param_info],
'ParamOut': params, 'ParamOut': params,
...@@ -235,12 +233,13 @@ class DistributedFusedLamb(Optimizer): ...@@ -235,12 +233,13 @@ class DistributedFusedLamb(Optimizer):
'FP32ShardFusedParamOffsets': [fp32_partial_fused_offsets], 'FP32ShardFusedParamOffsets': [fp32_partial_fused_offsets],
'FP16ShardFusedParamOffsets': [fp16_partial_fused_offsets], 'FP16ShardFusedParamOffsets': [fp16_partial_fused_offsets],
'FusedParamOffsets': [fused_offsets], 'FusedParamOffsets': [fused_offsets],
'ParamOrder': [param_order],
}, },
attrs={ attrs={
'alignment': self._alignment, 'alignment': self._alignment,
'rank': rank, 'rank': rank,
'nranks': nranks, 'nranks': nranks,
'weight_decay': weight_decay_values, 'apply_weight_decay': apply_weight_decay,
'moment1': 0.0, 'moment1': 0.0,
'moment2': 0.0, 'moment2': 0.0,
'beta1': self._beta1, 'beta1': self._beta1,
...@@ -272,8 +271,6 @@ class DistributedFusedLamb(Optimizer): ...@@ -272,8 +271,6 @@ class DistributedFusedLamb(Optimizer):
'Moment2': [moment2], 'Moment2': [moment2],
'Beta1Pow': [beta1pow], 'Beta1Pow': [beta1pow],
'Beta2Pow': [beta2pow], 'Beta2Pow': [beta2pow],
'FusedIndices': [fused_indices],
'WeightDecay': [weight_decay],
'GlobalScale': [scale], 'GlobalScale': [scale],
'ParamInfo': [param_info], 'ParamInfo': [param_info],
'Param': params, 'Param': params,
...@@ -281,6 +278,7 @@ class DistributedFusedLamb(Optimizer): ...@@ -281,6 +278,7 @@ class DistributedFusedLamb(Optimizer):
'FusedParamOffsets': [fused_offsets], 'FusedParamOffsets': [fused_offsets],
'FP32ShardFusedParamOffsets': [fp32_partial_fused_offsets], 'FP32ShardFusedParamOffsets': [fp32_partial_fused_offsets],
'FP16ShardFusedParamOffsets': [fp16_partial_fused_offsets], 'FP16ShardFusedParamOffsets': [fp16_partial_fused_offsets],
'ParamOrder': [param_order],
}, },
outputs={ outputs={
'FP32FusedParamOut': [fp32_fused_param], 'FP32FusedParamOut': [fp32_fused_param],
...@@ -294,6 +292,7 @@ class DistributedFusedLamb(Optimizer): ...@@ -294,6 +292,7 @@ class DistributedFusedLamb(Optimizer):
'FoundInf': [self._found_inf], 'FoundInf': [self._found_inf],
}, },
attrs={ attrs={
'weight_decay': self._weight_decay,
'beta1': self._beta1, 'beta1': self._beta1,
'beta2': self._beta2, 'beta2': self._beta2,
'epsilon': self._epsilon, 'epsilon': self._epsilon,
......
...@@ -1667,11 +1667,11 @@ def cross_entropy(input, ...@@ -1667,11 +1667,11 @@ def cross_entropy(input,
label_min = paddle.min(valid_label) label_min = paddle.min(valid_label)
label_max = paddle.max(valid_label) label_max = paddle.max(valid_label)
if label_min < 0: if label_min < 0:
raise ValueError("label should not out of bound, but got{}". raise ValueError("Target {} is out of lower bound.".format(
format(label_min)) label_min.item()))
if label_max >= input.shape[axis]: if label_max >= input.shape[axis]:
raise ValueError("label should not out of bound, but got{}". raise ValueError("Target {} is out of upper bound.".format(
format(label_max)) label_max.item()))
if core.is_compiled_with_npu() or core.is_compiled_with_mlu(): if core.is_compiled_with_npu() or core.is_compiled_with_mlu():
_, _, out = _C_ops.softmax_with_cross_entropy( _, _, out = _C_ops.softmax_with_cross_entropy(
input, label, 'soft_label', soft_label, 'ignore_index', input, label, 'soft_label', soft_label, 'ignore_index',
......
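For context on the reworded messages: a hard label must lie in [0, input.shape[axis]). The snippet below is a hypothetical illustration of tripping the new upper-bound check in dygraph; the message wording comes from the patch above, everything else here is an assumption.

import paddle
import paddle.nn.functional as F

logits = paddle.rand([4, 3])                           # 3 classes
label = paddle.to_tensor([0, 1, 2, 3], dtype='int64')  # 3 is >= the number of classes
try:
    F.cross_entropy(logits, label)
except ValueError as err:
    print(err)  # expected to read like: Target 3 is out of upper bound.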
...@@ -451,7 +451,20 @@ PADDLE_API {self.outputs['return_type']} {self.get_api_func_name() + '_'}({self. ...@@ -451,7 +451,20 @@ PADDLE_API {self.outputs['return_type']} {self.get_api_func_name() + '_'}({self.
param_code = "" param_code = ""
for param in infer_meta_params: for param in infer_meta_params:
if param in input_names: if param in input_names:
if param in self.optional_vars: if self.inputs['input_info'][param] == "const Tensor&":
param_code = param_code + "MakeMetaTensor(*" + PREFIX_TENSOR_NAME + param + "), "
elif self.inputs['input_info'][
param] == "const std::vector<Tensor>&":
meta_tensor_code = meta_tensor_code + f"""
{code_indent} auto {param}_meta_vec = MakeMetaTensor(*{PREFIX_TENSOR_NAME}{param});
{code_indent} std::vector<phi::MetaTensor*> {param}_metas({param}_meta_vec.size());
{code_indent} for (size_t i = 0; i < {param}_meta_vec.size(); ++i) {{
{code_indent} {param}_metas[i] = &{param}_meta_vec[i];
{code_indent} }}
"""
param_code = param_code + param + "_metas, "
elif param in self.optional_vars:
meta_tensor_code = meta_tensor_code + f""" meta_tensor_code = meta_tensor_code + f"""
{code_indent} paddle::optional<const phi::MetaTensor&> {PREFIX_TENSOR_NAME}meta_ref_{param}(paddle::none); {code_indent} paddle::optional<const phi::MetaTensor&> {PREFIX_TENSOR_NAME}meta_ref_{param}(paddle::none);
{code_indent} auto {PREFIX_TENSOR_NAME}meta_{param} = MakeMetaTensor({PREFIX_TENSOR_NAME}{param}); {code_indent} auto {PREFIX_TENSOR_NAME}meta_{param} = MakeMetaTensor({PREFIX_TENSOR_NAME}{param});
...@@ -461,7 +474,9 @@ PADDLE_API {self.outputs['return_type']} {self.get_api_func_name() + '_'}({self. ...@@ -461,7 +474,9 @@ PADDLE_API {self.outputs['return_type']} {self.get_api_func_name() + '_'}({self.
param_code = param_code + f"{PREFIX_TENSOR_NAME}meta_ref_{param}, " param_code = param_code + f"{PREFIX_TENSOR_NAME}meta_ref_{param}, "
else: else:
param_code = param_code + "MakeMetaTensor(*" + PREFIX_TENSOR_NAME + param + "), " raise ValueError(
f"{self.api} : Param of infer_meta error : {self.inputs['input_info'][param]} type is not supported."
)
elif param in kernel_output_names: elif param in kernel_output_names:
meta_tensor_code = meta_tensor_code + code_indent + " phi::MetaTensor " + param.replace( meta_tensor_code = meta_tensor_code + code_indent + " phi::MetaTensor " + param.replace(
'kernel_', PREFIX_META_TENSOR_NAME) + "(" + param + ");\n" 'kernel_', PREFIX_META_TENSOR_NAME) + "(" + param + ");\n"
......
...@@ -106,7 +106,7 @@ function prepare_benchmark_environment { ...@@ -106,7 +106,7 @@ function prepare_benchmark_environment {
[ $? -ne 0 ] && LOG "[FATAL] Clone benchmark repo fail." && exit -1 [ $? -ne 0 ] && LOG "[FATAL] Clone benchmark repo fail." && exit -1
LOG "[INFO] Collect api info ..." LOG "[INFO] Collect api info ..."
python benchmark/api/deploy/collect_api_info.py \ python benchmark/api/deploy/collect_api_info.py \
--test_module_name tests_v2 \ --test_module_name dynamic_tests_v2 \
--info_file api_info.txt >& 2 --info_file api_info.txt >& 2
[ $? -ne 0 ] && LOG "[FATAL] Collect api info fail." && exit -1 [ $? -ne 0 ] && LOG "[FATAL] Collect api info fail." && exit -1
[ ! -f benchmark/ci/scripts/op_benchmark.config ] && LOG "[FATAL] Missing op_benchmark.config!" && exit -1 [ ! -f benchmark/ci/scripts/op_benchmark.config ] && LOG "[FATAL] Missing op_benchmark.config!" && exit -1
...@@ -185,7 +185,7 @@ function run_op_benchmark_test { ...@@ -185,7 +185,7 @@ function run_op_benchmark_test {
logs_dir="$(pwd)/logs-${branch_name}" logs_dir="$(pwd)/logs-${branch_name}"
[ -d $logs_dir ] && rm -rf $logs_dir/* || mkdir -p $logs_dir [ -d $logs_dir ] && rm -rf $logs_dir/* || mkdir -p $logs_dir
pushd benchmark/api > /dev/null pushd benchmark/api > /dev/null
bash deploy/main_control.sh tests_v2 \ bash deploy/main_control.sh dynamic_tests_v2 \
tests_v2/configs \ tests_v2/configs \
$logs_dir \ $logs_dir \
$VISIBLE_DEVICES \ $VISIBLE_DEVICES \
...@@ -212,7 +212,7 @@ function check_op_benchmark_result { ...@@ -212,7 +212,7 @@ function check_op_benchmark_result {
# there is no need to recompile and install paddle # there is no need to recompile and install paddle
LOG "[INFO] retry ${retry_time} times ..." LOG "[INFO] retry ${retry_time} times ..."
pushd benchmark/api > /dev/null pushd benchmark/api > /dev/null
bash deploy/main_control.sh tests_v2 \ bash deploy/main_control.sh dynamic_tests_v2 \
tests_v2/configs \ tests_v2/configs \
${logs_dir} \ ${logs_dir} \
$VISIBLE_DEVICES \ $VISIBLE_DEVICES \
......
...@@ -958,7 +958,6 @@ FOURTH_HIGH_PARALLEL_JOB_NEW = [ ...@@ -958,7 +958,6 @@ FOURTH_HIGH_PARALLEL_JOB_NEW = [
'test_dynamic_rnn_stop_gradient', 'test_raw_program_optimizer', 'test_pow', 'test_dynamic_rnn_stop_gradient', 'test_raw_program_optimizer', 'test_pow',
'test_inplace_softmax_with_cross_entropy', 'test_transforms', 'test_inplace_softmax_with_cross_entropy', 'test_transforms',
'test_unfold_op', 'test_assign_op', 'test_isinstance', 'test_unfold_op', 'test_assign_op', 'test_isinstance',
'test_conv_affine_channel_fuse_pass',
'auto_growth_best_fit_allocator_facade_test', 'test_cholesky_op', 'auto_growth_best_fit_allocator_facade_test', 'test_cholesky_op',
'test_adaptive_avg_pool3d', 'test_paddle_save_load_binary', 'test_adaptive_avg_pool3d', 'test_paddle_save_load_binary',
'test_fused_fc_elementwise_layernorm_op', 'test_sequence_enumerate_op', 'test_fused_fc_elementwise_layernorm_op', 'test_sequence_enumerate_op',
...@@ -1873,7 +1872,6 @@ TETRAD_PARALLEL_JOB = [ ...@@ -1873,7 +1872,6 @@ TETRAD_PARALLEL_JOB = [
'test_dataloader_unkeep_order', 'test_dataloader_unkeep_order',
'test_parallel_executor_profiler', 'test_parallel_executor_profiler',
'test_correlation', 'test_correlation',
'test_conv_affine_channel_fuse_pass',
'test_ir_inplace_pass', 'test_ir_inplace_pass',
'test_moving_average_abs_max_scale_op', 'test_moving_average_abs_max_scale_op',
'test_flatten_contiguous_range_op', 'test_flatten_contiguous_range_op',
......
...@@ -578,7 +578,6 @@ STATIC_MODE_TESTING_LIST = [ ...@@ -578,7 +578,6 @@ STATIC_MODE_TESTING_LIST = [
'test_ir_embedding_eltwise_layernorm_fuse_pass', 'test_ir_embedding_eltwise_layernorm_fuse_pass',
'test_ir_fc_fuse_pass', 'test_ir_fc_fuse_pass',
'test_ir_skip_layernorm_pass', 'test_ir_skip_layernorm_pass',
'test_conv_affine_channel_fuse_pass',
'test_conv_bias_mkldnn_fuse_pass', 'test_conv_bias_mkldnn_fuse_pass',
'test_conv_bn_fuse_pass', 'test_conv_bn_fuse_pass',
'test_conv_elementwise_add2_act_fuse_pass', 'test_conv_elementwise_add2_act_fuse_pass',
......