Commit a4bccde0 authored by: phlrain

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into move_sgd_to_phi

......@@ -36,7 +36,7 @@ ENDIF()
if(NOT DEFINED XPU_BASE_URL)
SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev")
SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220219")
SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220228")
else()
SET(XPU_BASE_URL "${XPU_BASE_URL}")
endif()
......
......@@ -667,6 +667,7 @@ function(xpu_library TARGET_NAME)
else()
xpu_add_library(${TARGET_NAME} STATIC ${xpu_library_SRCS} DEPENDS ${xpu_library_DEPS})
find_fluid_modules(${TARGET_NAME})
find_phi_modules(${TARGET_NAME})
endif()
if (xpu_library_DEPS)
add_dependencies(${TARGET_NAME} ${xpu_library_DEPS})
......
......@@ -83,6 +83,8 @@ function(kernel_declare TARGET_LIST)
file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, XPU, ALL_LAYOUT);\n")
elseif (${kernel_path} MATCHES "./gpudnn\/")
file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, GPUDNN, ALL_LAYOUT);\n")
elseif (${kernel_path} MATCHES "./kps\/")
file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, KPS, ALL_LAYOUT);\n")
else ()
# deal with device-independent kernels, now we use CPU temporarily
file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, CPU, ALL_LAYOUT);\n")
......@@ -97,6 +99,7 @@ function(kernel_library TARGET)
set(gpu_srcs)
set(xpu_srcs)
set(gpudnn_srcs)
set(kps_srcs)
set(selected_rows_srcs)
# parse and save the dependent kernel targets
set(all_srcs)
......@@ -128,6 +131,9 @@ function(kernel_library TARGET)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu.cc)
list(APPEND gpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu.cc)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu)
list(APPEND gpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}_gpudnn.cu)
list(APPEND gpudnn_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}_gpudnn.cu)
endif()
......@@ -137,6 +143,15 @@ function(kernel_library TARGET)
list(APPEND xpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/xpu/${TARGET}.cc)
endif()
endif()
if (WITH_XPU_KP)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu)
# Change XPU2 file suffix
# NOTE(chenweihang): If we can be sure that the *.kps suffix is no longer used, it can be copied directly to *.xpu
file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/kps)
file(RENAME ${CMAKE_CURRENT_BINARY_DIR}/kps/${TARGET}.cu ${CMAKE_CURRENT_BINARY_DIR}/kps/${TARGET}.kps)
list(APPEND kps_srcs ${CMAKE_CURRENT_BINARY_DIR}/kps/${TARGET}.kps)
endif()
endif()
else()
# TODO(chenweihang): impl compile by source later
endif()
......@@ -150,6 +165,7 @@ function(kernel_library TARGET)
list(APPEND all_srcs ${gpu_srcs})
list(APPEND all_srcs ${xpu_srcs})
list(APPEND all_srcs ${gpudnn_srcs})
list(APPEND all_srcs ${kps_srcs})
foreach(src ${all_srcs})
file(READ ${src} target_content)
string(REGEX MATCHALL "#include \"paddle\/phi\/kernels\/[a-z0-9_]+_kernel.h\"" include_kernels ${target_content})
......@@ -176,11 +192,20 @@ function(kernel_library TARGET)
list(LENGTH gpu_srcs gpu_srcs_len)
list(LENGTH xpu_srcs xpu_srcs_len)
list(LENGTH gpudnn_srcs gpudnn_srcs_len)
list(LENGTH kps_srcs kps_srcs_len)
list(LENGTH selected_rows_srcs selected_rows_srcs_len)
# kernel source file level
# level 1: base device kernel
# - cpu_srcs / gpu_srcs / xpu_srcs / kps_srcs
# level 2: device-independent kernel
# - common_srcs
# level 3: Kernel implemented by reusing device-independent kernel
# - selected_rows_srcs
# Build the target according to the different src organizations
if((${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR
${xpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) AND
${xpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0) AND
(${common_srcs_len} GREATER 0 OR ${selected_rows_srcs_len} GREATER 0))
# If the common_srcs/selected_rows_srcs depend on specific device srcs, build the target using this rule.
if (WITH_GPU)
......@@ -193,6 +218,11 @@ function(kernel_library TARGET)
hip_library(${TARGET}_part SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
hip_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part)
endif()
elseif (WITH_XPU_KP)
if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0)
xpu_library(${TARGET}_part SRCS ${cpu_srcs} ${xpu_srcs} ${kps_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
xpu_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part)
endif()
else()
if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0)
cc_library(${TARGET}_part SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
......@@ -200,7 +230,7 @@ function(kernel_library TARGET)
endif()
endif()
# If there are only specific device srcs, build target using this rule.
elseif (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0)
elseif (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0)
if (WITH_GPU)
if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0)
nv_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
......@@ -209,6 +239,10 @@ function(kernel_library TARGET)
if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0)
hip_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
endif()
elseif (WITH_XPU_KP)
if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0)
xpu_library(${TARGET} SRCS ${cpu_srcs} ${xpu_srcs} ${kps_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
endif()
else()
if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0)
cc_library(${TARGET} SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
......@@ -222,6 +256,9 @@ function(kernel_library TARGET)
elseif (WITH_ROCM)
hip_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
hip_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part)
elseif (WITH_XPU_KP)
xpu_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
xpu_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part)
else()
cc_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
cc_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part)
......@@ -232,6 +269,8 @@ function(kernel_library TARGET)
nv_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
elseif (WITH_ROCM)
hip_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
elseif (WITH_XPU_KP)
xpu_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
else()
cc_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
endif()
......@@ -240,6 +279,8 @@ function(kernel_library TARGET)
nv_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
elseif (WITH_ROCM)
hip_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
elseif (WITH_XPU_KP)
xpu_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
else()
cc_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
endif()
......@@ -249,7 +290,7 @@ function(kernel_library TARGET)
if (${target_build_flag} EQUAL 1)
if (${common_srcs_len} GREATER 0 OR ${cpu_srcs_len} GREATER 0 OR
${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR
${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0 OR
${gpudnn_srcs_len} GREATER 0 OR ${selected_rows_srcs_len} GREATER 0)
# append target into PHI_KERNELS property
get_property(phi_kernels GLOBAL PROPERTY PHI_KERNELS)
......@@ -275,6 +316,9 @@ function(kernel_library TARGET)
if (${gpudnn_srcs_len} GREATER 0)
kernel_declare(${gpudnn_srcs})
endif()
if (${kps_srcs_len} GREATER 0)
kernel_declare(${kps_srcs})
endif()
if (${selected_rows_srcs_len} GREATER 0)
kernel_declare(${selected_rows_srcs})
endif()
......
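The kernel_declare branch above appends one PD_DECLARE_KERNEL line per kernel source file, choosing the backend from the source path (xpu/, gpudnn/, kps/, or CPU as the fallback), while kernel_library groups the collected srcs into per-device build targets. As a rough illustration only, for a hypothetical kernel named my_scale with cpu/ and kps/ sources (the name is an assumption, not taken from this diff), the generated declaration file would contain lines of this form:
// Hypothetical excerpt of the generated kernel declaration file; the
// PD_DECLARE_KERNEL(name, backend, layout) form matches the
// file(APPEND ...) calls in kernel_declare above.
PD_DECLARE_KERNEL(my_scale, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(my_scale, KPS, ALL_LAYOUT);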
cc_library(processgroup SRCS ProcessGroup.cc DEPS phi phi_api eager_api)
cc_library(eager_reducer SRCS reducer.cc DEPS eager_api processgroup)
if(WITH_NCCL)
cc_library(processgroup_nccl SRCS ProcessGroupNCCL.cc DEPS place cuda_stream enforce collective_helper device_context phi phi_api eager_api)
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/collective/reducer.h"
#include "paddle/phi/common/data_type.h"
namespace paddle {
namespace distributed {
std::vector<std::vector<size_t>> Eager_AssignGroupBySize(
const std::vector<Tensor> tensors,
const std::vector<bool> &is_sparse_gradient,
const std::vector<size_t> &group_size_limits,
const std::vector<int64_t> &tensor_indices) {
PADDLE_ENFORCE_EQ(
tensors.size(), is_sparse_gradient.size(),
platform::errors::PreconditionNotMet(
"tensors len must be equal to is_sparse_gradient len, but "
"[%lu] != [%lu]",
tensors.size(), is_sparse_gradient.size()));
auto check_perm = [](const std::vector<int64_t> &x) -> bool {
size_t len = x.size();
std::vector<size_t> cnt(len, 0);
for (size_t i = 0; i < len; ++i) {
if (x[i] >= static_cast<int64_t>(len) || x[i] < 0 || cnt[x[i]]) {
return false;
}
cnt[x[i]]++;
}
return true;
};
PADDLE_ENFORCE_EQ(true, check_perm(tensor_indices),
platform::errors::PreconditionNotMet(
"tensor_indices must be a permutation from 0 to %lu",
tensor_indices.size()));
// the return vector
std::vector<std::vector<size_t>> res;
// Key: the var type
// Value: which index in group_size_limits to use as this group's size limit
std::map<experimental::DataType, size_t> group_limit_index;
// Key: the var type
// Value: <the var index in input tensors, total numel in this group>
std::map<experimental::DataType, std::pair<std::vector<size_t>, size_t>>
next_group;
for (size_t i = 0; i < tensors.size(); ++i) {
const auto &var = tensors[i];
size_t tensor_real_index = i;
if (!tensor_indices.empty()) {
tensor_real_index = tensor_indices[i];
}
if (is_sparse_gradient[tensor_real_index]) {
// keep each sparse var in its own group
res.push_back({tensor_real_index});
continue;
}
const auto &var_dtype = var.dtype();
VLOG(3) << "var[" << var.name() << "] 's type is " << var_dtype;
auto &group_info = next_group[var_dtype];
int64_t var_size = -1;
if (var.is_dense_tensor()) {
var_size =
std::dynamic_pointer_cast<phi::DenseTensor>(var.impl())->numel();
} else {
VLOG(3) << "var " << var.name()
<< " is not tensor or selected_rows, so skip it";
continue;
}
group_info.first.push_back(tensor_real_index);
group_info.second += experimental::SizeOf(var_dtype) * var_size;
// group_info.second += framework::SizeOfType(var_dtype) * var_size;
if (group_limit_index.find(var_dtype) == group_limit_index.end()) {
// means it is the first var of var_dtype
group_limit_index[var_dtype] = 0;
}
auto &cur_limit_index = group_limit_index[var_dtype];
if (group_info.second >= group_size_limits[cur_limit_index]) {
// the group exceeds its capacity, so start a new group
res.emplace_back(std::move(group_info.first));
group_info = std::pair<std::vector<size_t>, size_t>();
cur_limit_index =
(std::min)(cur_limit_index + 1, group_size_limits.size() - 1);
}
}
// add the final groups
for (auto &e : next_group) {
auto &group_info = e.second;
if (!group_info.first.empty()) {
res.emplace_back(std::move(group_info.first));
}
}
for (const auto &group_index : res) {
PADDLE_ENFORCE_NE(
group_index.empty(), true,
platform::errors::PreconditionNotMet(
"AssignGroupBySize construct empty group, please check."));
}
if (tensor_indices.empty()) {
std::sort(res.begin(), res.end(),
[](const std::vector<size_t> &x, const std::vector<size_t> &y) {
return x.front() < y.front();
});
}
return res;
}
} // namespace distributed
} // namespace paddle
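Eager_AssignGroupBySize above keeps every sparse gradient in its own group and packs dense gradients of the same dtype together until the accumulated byte size reaches the current entry of group_size_limits. A minimal standalone sketch of that bucketing rule, using plain std types instead of paddle::experimental::Tensor (the FakeGrad struct and the single size limit are assumptions for illustration):
#include <cstddef>
#include <map>
#include <string>
#include <utility>
#include <vector>
// Simplified stand-in for a gradient tensor: dtype name, byte size, sparsity.
// Illustrative only; the real code works on paddle Tensors and DataType keys.
struct FakeGrad {
  std::string dtype;
  std::size_t bytes;
  bool sparse;
};
// Sketch of the same bucketing rule: sparse grads form singleton groups,
// dense grads of one dtype accumulate until the size limit is reached.
std::vector<std::vector<std::size_t>> AssignGroupBySizeSketch(
    const std::vector<FakeGrad>& grads, std::size_t group_size_limit) {
  std::vector<std::vector<std::size_t>> groups;
  std::map<std::string, std::pair<std::vector<std::size_t>, std::size_t>> pending;
  for (std::size_t i = 0; i < grads.size(); ++i) {
    if (grads[i].sparse) {
      groups.push_back({i});  // a sparse var stays in a single group
      continue;
    }
    auto& bucket = pending[grads[i].dtype];
    bucket.first.push_back(i);
    bucket.second += grads[i].bytes;
    if (bucket.second >= group_size_limit) {
      groups.emplace_back(std::move(bucket.first));
      bucket = {};
    }
  }
  for (auto& kv : pending) {  // flush the unfinished buckets
    if (!kv.second.first.empty()) groups.emplace_back(std::move(kv.second.first));
  }
  return groups;
}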
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......@@ -14,41 +14,19 @@
#pragma once
#include <string>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include <map>
#include <vector>
#include "paddle/fluid/distributed/collective/ProcessGroup.h"
#include "paddle/fluid/eager/api/utils/tensor_utils.h"
namespace paddle {
namespace framework {
namespace ir {
/*
* Fuse the Conv and ConvAffineChannel.
*/
class Graph;
class ConvAffineChannelFusePass : public FusePassBase {
public:
ConvAffineChannelFusePass();
virtual ~ConvAffineChannelFusePass() {}
protected:
void ApplyImpl(ir::Graph*) const override;
const std::string name_scope_{"conv_affine_channel_fuse"};
};
class ConvEltwiseAddAffineChannelFusePass : public FusePassBase {
public:
ConvEltwiseAddAffineChannelFusePass();
virtual ~ConvEltwiseAddAffineChannelFusePass() {}
namespace distributed {
using Tensor = paddle::experimental::Tensor;
protected:
void ApplyImpl(ir::Graph*) const override;
const std::string name_scope_{"conv_eltwiseadd_affine_channel_fuse"};
};
std::vector<std::vector<size_t>> Eager_AssignGroupBySize(
const std::vector<Tensor>, const std::vector<bool>& is_sparse_gradient,
const std::vector<size_t>& group_size_limits,
const std::vector<int64_t>& tensor_indices = {});
} // namespace ir
} // namespace framework
} // namespace distributed
} // namespace paddle
......@@ -308,22 +308,25 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx,
// TODO(chenweihang): support multiple inputs and outputs later
phi::InferMetaContext infer_meta_context;
for (auto& in_name : input_names) {
if (ctx->HasInput(in_name)) {
infer_meta_context.EmplaceBackInput(std::make_shared<CompatMetaTensor>(
ctx->GetInputVarPtrs(in_name)[0], ctx->IsRuntime()));
if (ctx->HasInputs(in_name)) {
auto input_var = ctx->GetInputVarPtrs(in_name);
if (input_var.size() == 1) {
infer_meta_context.EmplaceBackInput(
std::make_shared<CompatMetaTensor>(input_var[0], ctx->IsRuntime()));
} else {
infer_meta_context.EmplaceBackInput({nullptr});
paddle::SmallVector<std::shared_ptr<phi::MetaTensor>> inputs;
inputs.reserve(input_var.size());
for (const auto& in : input_var) {
inputs.push_back(
std::make_shared<CompatMetaTensor>(in, ctx->IsRuntime()));
}
infer_meta_context.EmplaceBackInputs(std::move(inputs));
}
for (auto& out_name : output_names) {
if (ctx->HasOutput(out_name)) {
infer_meta_context.EmplaceBackOutput(std::make_shared<CompatMetaTensor>(
ctx->GetOutputVarPtrs(out_name)[0], ctx->IsRuntime()));
} else {
infer_meta_context.EmplaceBackOutput({nullptr});
infer_meta_context.EmplaceBackInput({nullptr});
}
}
auto attr_reader = ctx->Attrs();
for (size_t i = 0; i < attr_names.size(); ++i) {
auto attr_name = attr_names[i];
......@@ -348,13 +351,13 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx,
}
} else {
// If is not in runtime, we will set default value(-1) for ScalarArray
int64_t num_ele = 0;
std::vector<VarDesc*> vars;
vars.reserve(infershape_inputs.size());
for (size_t i = 0; i < infershape_inputs.size(); i++) {
for (size_t i = 0; i < infershape_inputs.size(); ++i) {
vars.push_back(BOOST_GET_CONST(VarDesc*, infershape_inputs[i]));
}
int64_t num_ele = 0;
if (vars.size() == 1) {
num_ele = 1;
const auto& tensor_dims = vars[0]->GetShape();
......@@ -362,16 +365,7 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx,
num_ele *= tensor_dims[i];
}
} else {
for (auto& var : vars) {
const auto& tensor_dims = var->GetShape();
PADDLE_ENFORCE_EQ(tensor_dims.size(), 1,
platform::errors::InvalidArgument(
"The shape is constructed by multi-tensor, "
"every tensor's dims should be 1. But your "
"shape has tensor that dims is %s.",
tensor_dims.size()));
num_ele += tensor_dims[0];
}
num_ele = vars.size();
}
phi::ScalarArray tensor_attr(std::vector<int32_t>(num_ele, -1));
tensor_attr.SetFromTensor(true);
......@@ -383,10 +377,14 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx,
std::type_index(typeid(std::vector<int32_t>))) {
infer_meta_context.EmplaceBackAttr(std::move(
phi::ScalarArray(BOOST_GET_CONST(std::vector<int32_t>, attr))));
} else if (std::type_index(attr.type()) ==
std::type_index(typeid(int))) {
infer_meta_context.EmplaceBackAttr(
phi::ScalarArray({BOOST_GET_CONST(int, attr)}));
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Unsupported cast op attribute `%s` to ScalarArray when "
"construct KernelContext.",
"construct InferMetaContext.",
attr_name));
}
}
......@@ -414,7 +412,6 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx,
}
} else if (ctx->HasInput(attr_name)) {
const auto& infershape_input = ctx->GetInputVarPtrs(attr_name);
if (infershape_input.size() == 1) {
if (ctx->IsRuntime()) {
Variable* var = BOOST_GET_CONST(Variable*, infershape_input[0]);
......@@ -490,6 +487,28 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx,
"Unsupported attribute type is received when call "
"InferShapeFunctor."));
}
} else {
// do nothing
}
}
for (auto& out_name : output_names) {
if (ctx->HasOutputs(out_name)) {
auto output_var = ctx->GetOutputVarPtrs(out_name);
if (output_var.size() == 1) {
infer_meta_context.EmplaceBackOutput(std::make_shared<CompatMetaTensor>(
output_var[0], ctx->IsRuntime()));
} else {
paddle::SmallVector<std::shared_ptr<phi::MetaTensor>> outputs;
outputs.reserve(output_var.size());
for (const auto& out : output_var) {
outputs.emplace_back(
std::make_shared<CompatMetaTensor>(out, ctx->IsRuntime()));
}
infer_meta_context.EmplaceBackOutputs(std::move(outputs));
}
} else {
infer_meta_context.EmplaceBackOutput({nullptr});
}
}
......
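When the InferShape pass is not in runtime, the code above builds a placeholder phi::ScalarArray filled with -1: if the shape attribute comes from a single shape tensor, the placeholder length is the product of that tensor's dims, and if it comes from a list of shape tensors, the length is simply the number of tensors. A small standalone sketch of that element-count rule (plain C++, with the ScalarArray itself omitted):
#include <cstdint>
#include <vector>
// Sketch of the placeholder-size rule used above when ctx->IsRuntime() is
// false: one shape var -> product of its dims, several shape vars -> one
// placeholder element per var.
int64_t PlaceholderNumel(const std::vector<std::vector<int64_t>>& shape_var_dims) {
  if (shape_var_dims.size() == 1) {
    int64_t num_ele = 1;
    for (int64_t d : shape_var_dims[0]) num_ele *= d;
    return num_ele;
  }
  return static_cast<int64_t>(shape_var_dims.size());
}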
......@@ -78,7 +78,6 @@ pass_library(is_test_pass base)
pass_library(conv_elementwise_add_act_fuse_pass inference)
pass_library(conv_elementwise_add2_act_fuse_pass inference)
pass_library(conv_elementwise_add_fuse_pass inference)
pass_library(conv_affine_channel_fuse_pass inference)
pass_library(transpose_flatten_concat_fuse_pass inference)
pass_library(identity_scale_op_clean_pass base)
pass_library(sync_batch_norm_pass base)
......
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h"
#include <cmath>
#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/framework/op_version_registry.h"
namespace phi {
class DenseTensor;
} // namespace phi
namespace paddle {
namespace framework {
class Scope;
} // namespace framework
} // namespace paddle
namespace paddle {
namespace framework {
namespace ir {
class Node;
#define GET_CONV_BN_NODES(pattern_name) \
/* OPERATORS */ \
GET_IR_NODE_FROM_SUBGRAPH(conv, conv, pattern_name); \
GET_IR_NODE_FROM_SUBGRAPH(affine_channel, affine_channel, pattern_name); \
/* CONV inputs */ \
GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight, pattern_name); \
/* CONV outputs */ \
GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, pattern_name); \
/* Affine Channel inputs */ \
GET_IR_NODE_FROM_SUBGRAPH(ac_scale, ac_scale, pattern_name); \
GET_IR_NODE_FROM_SUBGRAPH(ac_bias, ac_bias, pattern_name); \
/* Affine channel outputs */ \
GET_IR_NODE_FROM_SUBGRAPH(ac_out, ac_out, pattern_name); /* Out */
void recompute_bias_and_weights(const Scope* scope, ir::Node* conv_weight,
const ir::Node& ac_scale,
const LoDTensor& ac_bias_tensor,
LoDTensor* eltwise_y_in_tensor) {
using EigenVectorArrayMap =
Eigen::Map<Eigen::Array<float, Eigen::Dynamic, 1>>;
using ConstEigenVectorArrayMap =
Eigen::Map<const Eigen::Array<float, Eigen::Dynamic, 1>>;
using EigenMatrixArrayMap = Eigen::Map<
Eigen::Array<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
// Re-compute bias of conv2d from AffineChannel
PADDLE_ENFORCE_EQ(
eltwise_y_in_tensor->dims(), ac_bias_tensor.dims(),
platform::errors::InvalidArgument(
"Tensor elementwise y(%d) and activation bias(%d) must have same "
"dimension.",
eltwise_y_in_tensor->dims().size(), ac_bias_tensor.dims().size()));
auto* scale_tensor = scope->FindVar(ac_scale.Name())->GetMutable<LoDTensor>();
ConstEigenVectorArrayMap scale_array(scale_tensor->data<float>(),
scale_tensor->numel(), 1);
ConstEigenVectorArrayMap ac_bias_array(ac_bias_tensor.data<float>(),
ac_bias_tensor.numel(), 1);
EigenVectorArrayMap eltwise_y_in_array(
eltwise_y_in_tensor->mutable_data<float>(platform::CPUPlace()),
eltwise_y_in_tensor->numel(), 1);
eltwise_y_in_array = (eltwise_y_in_array * scale_array) + ac_bias_array;
// Re-compute weight of conv2d from AffineChannel
auto* weights = scope->FindVar(conv_weight->Name())->GetMutable<LoDTensor>();
auto weights_shape = weights->dims();
auto weights_shape_2d = phi::flatten_to_2d(weights_shape, 1);
auto* weights_data = weights->mutable_data<float>(platform::CPUPlace());
EigenMatrixArrayMap weights_array_2d(weights_data, weights_shape_2d[0],
weights_shape_2d[1]);
weights_array_2d.colwise() *= scale_array;
// Check for subnormal values that slow down convolution execution
for (int i = 0; i < weights->numel(); ++i) {
if (std::fpclassify(weights_data[i]) == FP_SUBNORMAL) weights_data[i] = 0;
}
}
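recompute_bias_and_weights above folds the affine_channel transform y = Scale * x + Bias into the preceding conv2d: the new conv bias becomes eltwise_y * scale + ac_bias, and each output-channel row of the weights flattened to 2-D is multiplied by that channel's scale. A plain-loop sketch of the same folding, without Eigen or LoDTensor (the row-major [out_channels, kernel_numel] layout assumption mirrors the flatten_to_2d call above):
#include <cstddef>
#include <vector>
// Sketch of the affine_channel folding performed by recompute_bias_and_weights.
void FoldAffineChannelSketch(const std::vector<float>& ac_scale,
                             const std::vector<float>& ac_bias,
                             std::vector<float>* conv_bias,     // size: out_channels
                             std::vector<float>* weights_2d) {  // out_channels x kernel_numel
  const std::size_t oc = ac_scale.size();
  const std::size_t k = weights_2d->size() / oc;
  for (std::size_t c = 0; c < oc; ++c) {
    // new bias = old bias * scale + affine_channel bias
    (*conv_bias)[c] = (*conv_bias)[c] * ac_scale[c] + ac_bias[c];
    // scale the whole output-channel row of the flattened weights
    for (std::size_t j = 0; j < k; ++j) {
      (*weights_2d)[c * k + j] *= ac_scale[c];
    }
  }
}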
ConvAffineChannelFusePass::ConvAffineChannelFusePass() {
AddOpCompat(OpCompat("conv2d"))
.AddInput("Input")
.IsTensor()
.End()
.AddInput("Filter")
.IsTensor()
.End()
.AddInput("Bias")
.IsTensor()
.IsOptional()
.End()
.AddInput("ResidualData")
.IsTensor()
.IsOptional()
.End()
.AddOutput("Output")
.IsTensor()
.End()
.AddAttr("strides")
.IsType<std::vector<int>>()
.End()
.AddAttr("paddings")
.IsType<std::vector<int>>()
.End()
.AddAttr("padding_algorithm")
.IsOptional()
.IsStringIn({"EXPLICIT", "SAME", "VALID"})
.End()
.AddAttr("groups")
.IsNumGE(1)
.End()
.AddAttr("dilations")
.IsType<std::vector<int>>()
.End()
.AddAttr("data_format")
.IsStringIn({"NCHW", "AnyLayout"})
.End();
AddOpCompat(OpCompat("affine_channel"))
.AddInput("X")
.IsTensor()
.End()
.AddInput("Scale")
.IsTensor()
.End()
.AddInput("Bias")
.IsTensor()
.IsOptional()
.End()
.AddOutput("Out")
.IsTensor()
.End()
.AddAttr("data_layout")
.IsStringIn({"NCHW", "AnyLayout"})
.End();
AddOpCompat(OpCompat("elementwise_add"))
.AddInput("X")
.IsTensor()
.End()
.AddInput("Y")
.IsTensor()
.End()
.AddOutput("Out")
.IsTensor()
.End()
.AddAttr("axis")
.IsNumEQ(1)
.End();
}
void ConvAffineChannelFusePass::ApplyImpl(ir::Graph* graph) const {
PADDLE_ENFORCE_NOT_NULL(
graph, platform::errors::InvalidArgument("Graph cannot be nullptr."));
FusePassBase::Init(name_scope_, graph);
auto* scope = param_scope();
PADDLE_ENFORCE_NOT_NULL(
scope, platform::errors::InvalidArgument("Scope cannot be nullptr."));
GraphPatternDetector gpd;
auto* conv_input =
gpd.mutable_pattern()
->NewNode(patterns::PDNodeName(name_scope_, "conv_input"))
->AsInput()
->assert_is_op_input("conv2d", "Input");
patterns::ConvAffineChannel conv_ac_pattern(gpd.mutable_pattern(),
name_scope_);
conv_ac_pattern(conv_input, false /*with_eltwise_add*/);
int found_conv_ac_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
if (!IsCompat(subgraph, g)) {
LOG(WARNING) << "ConvAffineChannelFusePass in op compat failed.";
return;
}
VLOG(4) << "handle ConvAffineChannel fuse";
GET_CONV_BN_NODES(conv_ac_pattern);
auto data_format = conv->Op()->GetAttrIfExists<std::string>("data_format");
if (data_format == "AnyLayout") {
LOG_FIRST_N(WARNING, 1) << "conv_affine_channel_fuse_pass is enabled, "
"it's wrong if data_format of conv is not "
"NCHW.";
}
// Get affine_channel bias for resizing eltwise_y!
auto* ac_bias_tensor =
scope->FindVar(ac_bias->Name())->GetMutable<LoDTensor>();
// Create eltwise_y (conv bias) variable
VarDesc eltwise_y_in_desc(
patterns::PDNodeName(name_scope_, "eltwise_y_in"));
// Set shape && datatype manually
eltwise_y_in_desc.SetShape(phi::vectorize(ac_bias_tensor->dims()));
eltwise_y_in_desc.SetDataType(
framework::TransToProtoVarType(ac_bias_tensor->dtype()));
eltwise_y_in_desc.SetLoDLevel(ac_bias->Var()->GetLoDLevel());
eltwise_y_in_desc.SetPersistable(true);
// Initialize eltwise_y
auto* eltwise_y_in_node = g->CreateVarNode(&eltwise_y_in_desc);
auto* eltwise_y_in_tensor =
scope->Var(eltwise_y_in_node->Name())->GetMutable<LoDTensor>();
eltwise_y_in_tensor->Resize(ac_bias_tensor->dims());
std::fill_n(eltwise_y_in_tensor->mutable_data<float>(platform::CPUPlace()),
eltwise_y_in_tensor->numel(), 0.0f);
// update weights and biases
recompute_bias_and_weights(scope, conv_weight, *ac_scale, *ac_bias_tensor,
eltwise_y_in_tensor);
// create an elementwise add node.
OpDesc desc;
desc.SetInput("X", std::vector<std::string>({conv_out->Name()}));
desc.SetInput("Y", std::vector<std::string>({eltwise_y_in_node->Name()}));
desc.SetOutput("Out", std::vector<std::string>({ac_out->Name()}));
desc.SetType("elementwise_add");
desc.SetAttr("axis", 1);
desc.SetAttr("use_mkldnn", conv->Op()->GetAttrIfExists<bool>("use_mkldnn"));
auto eltwise_op = g->CreateOpNode(&desc); // OpDesc will be copied.
GraphSafeRemoveNodes(graph, {ac_scale, ac_bias, affine_channel});
IR_NODE_LINK_TO(conv_out, eltwise_op);
IR_NODE_LINK_TO(eltwise_y_in_node, eltwise_op);
IR_NODE_LINK_TO(eltwise_op, ac_out);
found_conv_ac_count++;
};
gpd(graph, handler);
AddStatis(found_conv_ac_count);
}
ConvEltwiseAddAffineChannelFusePass::ConvEltwiseAddAffineChannelFusePass() {
AddOpCompat(OpCompat("conv2d"))
.AddInput("Input")
.IsTensor()
.End()
.AddInput("Filter")
.IsTensor()
.End()
.AddInput("Bias")
.IsTensor()
.IsOptional()
.End()
.AddInput("ResidualData")
.IsTensor()
.IsOptional()
.End()
.AddOutput("Output")
.IsTensor()
.End()
.AddAttr("strides")
.IsType<std::vector<int>>()
.End()
.AddAttr("paddings")
.IsType<std::vector<int>>()
.End()
.AddAttr("padding_algorithm")
.IsOptional()
.IsStringIn({"EXPLICIT", "SAME", "VALID"})
.End()
.AddAttr("groups")
.IsNumGE(1)
.End()
.AddAttr("dilations")
.IsType<std::vector<int>>()
.End()
.AddAttr("data_format")
.IsStringIn({"NCHW", "AnyLayout"})
.End();
AddOpCompat(OpCompat("affine_channel"))
.AddInput("X")
.IsTensor()
.End()
.AddInput("Scale")
.IsTensor()
.End()
.AddInput("Bias")
.IsTensor()
.IsOptional()
.End()
.AddOutput("Out")
.IsTensor()
.End()
.AddAttr("data_layout")
.IsStringIn({"NCHW", "AnyLayout"})
.End();
AddOpCompat(OpCompat("elementwise_add"))
.AddInput("X")
.IsTensor()
.End()
.AddInput("Y")
.IsTensor()
.End()
.AddOutput("Out")
.IsTensor()
.End()
.AddAttr("axis")
.IsNumEQ(1)
.End();
}
void ConvEltwiseAddAffineChannelFusePass::ApplyImpl(ir::Graph* graph) const {
PADDLE_ENFORCE_NOT_NULL(
graph, platform::errors::InvalidArgument("Graph cannot be nullptr."));
FusePassBase::Init(name_scope_, graph);
auto* scope = param_scope();
PADDLE_ENFORCE_NOT_NULL(
scope, platform::errors::InvalidArgument("Scope cannot be nullptr."));
GraphPatternDetector gpd;
auto* conv_input =
gpd.mutable_pattern()
->NewNode(patterns::PDNodeName(name_scope_, "conv_input"))
->AsInput()
->assert_is_op_input("conv2d", "Input");
patterns::ConvAffineChannel conv_ac_pattern(gpd.mutable_pattern(),
name_scope_);
conv_ac_pattern(conv_input, true /*with_eltwise_add*/);
int found_conv_ac_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
if (!IsCompat(subgraph, g)) {
LOG(WARNING)
<< "ConvEltwiseAddAffineChannelFusePass in op compat failed.";
return;
}
VLOG(4) << "handle ConvBN fuse";
GET_CONV_BN_NODES(conv_ac_pattern);
auto data_format = conv->Op()->GetAttrIfExists<std::string>("data_format");
if (data_format == "AnyLayout") {
LOG_FIRST_N(WARNING, 1) << "conv_eltwiseadd_affine_channel_fuse_pass is "
"enabled, it's wrong if data_format of conv "
"is not NCHW.";
}
// OPERATORS
GET_IR_NODE_FROM_SUBGRAPH(eltwise, eltwise, conv_ac_pattern);
// BIAS inputs
GET_IR_NODE_FROM_SUBGRAPH(eltwise_y_in, eltwise_y_in, conv_ac_pattern);
// BIAS outputs
GET_IR_NODE_FROM_SUBGRAPH(eltwise_out, eltwise_out, conv_ac_pattern);
// Get eltwise_y (conv bias) variable
auto* eltwise_y_in_tensor =
scope->FindVar(eltwise_y_in->Name())->GetMutable<LoDTensor>();
// Get affine_channel bias
auto* ac_bias_tensor =
scope->FindVar(ac_bias->Name())->GetMutable<LoDTensor>();
recompute_bias_and_weights(scope, conv_weight, *ac_scale, *ac_bias_tensor,
eltwise_y_in_tensor);
// Update the elementwise_add node
eltwise->Op()->SetAttr("axis", 1);
eltwise->Op()->SetOutput("Out", std::vector<std::string>({ac_out->Name()}));
GraphSafeRemoveNodes(graph,
{ac_scale, ac_bias, affine_channel, eltwise_out});
IR_NODE_LINK_TO(eltwise, ac_out);
found_conv_ac_count++;
};
gpd(graph, handler);
AddStatis(found_conv_ac_count);
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(conv_affine_channel_fuse_pass,
paddle::framework::ir::ConvAffineChannelFusePass);
REGISTER_PASS(conv_eltwiseadd_affine_channel_fuse_pass,
paddle::framework::ir::ConvEltwiseAddAffineChannelFusePass);
REGISTER_PASS_CAPABILITY(conv_affine_channel_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.LE("conv2d", 1)
.EQ("affine_channel", 0));
REGISTER_PASS_CAPABILITY(conv_eltwiseadd_affine_channel_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.LE("conv2d", 1)
.LE("elementwise_add", 1)
.EQ("affine_channel", 0));
......@@ -2074,6 +2074,7 @@ void OperatorWithKernel::BuildPhiKernelContext(
}
pt_kernel_context->AssignInputRange(std::make_pair(start_idx, end_idx), i);
}
VLOG(4) << "Done inputs";
for (size_t i = 0; i < output_names.size(); ++i) {
auto it = ctx.outputs.find(output_names[i]);
......@@ -2107,17 +2108,12 @@ void OperatorWithKernel::BuildPhiKernelContext(
"Unsupported output `%s` type when call pt kernel.",
framework::ToTypeName(var->Type())));
}
experimental::ResetTensorDtypeAndLayoutByArgDef(tensor_out,
output_defs.at(i));
SetAllocationForOutputTenosr(
tensor_out, phi::TransToPhiPlace(output_defs.at(i).backend));
pt_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out);
}
pt_kernel_context->AssignOutputRange(std::make_pair(start_idx, end_idx), i);
}
VLOG(4) << "Done outputs";
for (size_t i = 0; i < attr_names.size(); ++i) {
if (attr_defs[i].type_index == std::type_index(typeid(phi::ScalarArray))) {
......@@ -2226,6 +2222,7 @@ void OperatorWithKernel::BuildPhiKernelContext(
}
}
}
VLOG(4) << "Done attributes";
}
} // namespace framework
......
......@@ -68,6 +68,8 @@ OpKernelType TransPhiKernelKeyToOpKernelType(const phi::KernelKey& kernel_key) {
library_type = LibraryType::kMKLDNN;
} else if (kernel_key.backend() == phi::Backend::GPUDNN) {
library_type = LibraryType::kCUDNN;
} else if (kernel_key.backend() == phi::Backend::KPS) {
library_type = LibraryType::kKP;
} else {
// do nothing
}
......@@ -82,6 +84,8 @@ phi::KernelKey TransOpKernelTypeToPhiKernelKey(
backend = phi::Backend::MKLDNN;
} else if (kernel_type.library_type_ == LibraryType::kCUDNN) {
backend = phi::Backend::GPUDNN;
} else if (kernel_type.library_type_ == LibraryType::kKP) {
backend = phi::Backend::KPS;
} else {
// do nothing
}
......@@ -229,26 +233,5 @@ static void SetAllocationForUninitializedDenseTensor(
dense_tensor->ResetHolder(shared_allocation);
}
void SetAllocationForOutputTenosr(phi::TensorBase* tensor,
const platform::Place& place) {
if (phi::DenseTensor::classof(tensor)) {
auto* dense_tensor = static_cast<phi::DenseTensor*>(tensor);
if (!dense_tensor->IsInitialized() || !(dense_tensor->place() == place)) {
SetAllocationForUninitializedDenseTensor(dense_tensor, place);
}
} else if (phi::SelectedRows::classof(tensor)) {
auto* selected_rows = static_cast<phi::SelectedRows*>(tensor);
if (!selected_rows->value().IsInitialized() ||
!(selected_rows->place() == place)) {
SetAllocationForUninitializedDenseTensor(selected_rows->mutable_value(),
place);
}
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Unsupported tensor type is received when setting allocation for "
"output tensor."));
}
}
} // namespace framework
} // namespace paddle
......@@ -62,9 +62,6 @@ class KernelArgsNameMaker {
void InitDefaultKernelSignatureMap();
void SetAllocationForOutputTenosr(phi::TensorBase* tensor,
const platform::Place& place);
// TODO(Wilber): support other device contexts.
template <typename T>
struct ConvertToPhiContext {
......
......@@ -323,12 +323,6 @@ void BuildDygraphPhiKernelContext(
"Unsupported output `%s` type when call pt kernel.",
framework::ToTypeName(var->Type())));
}
experimental::ResetTensorDtypeAndLayoutByArgDef(tensor_out,
output_defs.at(i));
framework::SetAllocationForOutputTenosr(
tensor_out, phi::TransToPhiPlace(output_defs.at(i).backend));
kernel_ctx->EmplaceBackOutputWithoutSetRange(tensor_out);
}
kernel_ctx->AssignOutputRange(std::make_pair(start_idx, end_idx), i);
......
......@@ -75,9 +75,7 @@ void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) {
void PaddlePassBuilder::ClearPasses() { passes_.clear(); }
const std::vector<std::string> kTRTSubgraphPasses({
"conv_affine_channel_fuse_pass", //
"adaptive_pool2d_convert_global_pass",
"conv_eltwiseadd_affine_channel_fuse_pass", //
"shuffle_channel_detect_pass", //
"quant_conv2d_dequant_fuse_pass", //
"delete_quant_dequant_op_pass", //
......@@ -136,8 +134,6 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
// "identity_scale_op_clean_pass", //
"is_test_pass", //
"simplify_with_basic_ops_pass", //
"conv_affine_channel_fuse_pass", //
"conv_eltwiseadd_affine_channel_fuse_pass", //
"conv_bn_fuse_pass", //
"conv_eltwiseadd_bn_fuse_pass", //
"embedding_eltwise_layernorm_fuse_pass", //
......@@ -239,8 +235,6 @@ void CpuPassStrategy::EnableMKLDNN() {
"depthwise_conv_mkldnn_pass", //
"conv_bn_fuse_pass", // Execute BN passes again to
"conv_eltwiseadd_bn_fuse_pass", // preserve correct pass order
"conv_affine_channel_fuse_pass", //
"conv_eltwiseadd_affine_channel_fuse_pass", //
"conv_transpose_bn_fuse_pass", //
"conv_transpose_eltwiseadd_bn_fuse_pass", //
"conv_bias_mkldnn_fuse_pass", //
......
......@@ -29,9 +29,5 @@ using CUDA = paddle::platform::CUDADeviceContext;
ops::CastOpKernel<CUDA, plat::complex<float>>, \
ops::CastOpKernel<CUDA, plat::complex<double>>, ##__VA_ARGS__);
#if !defined(PADDLE_WITH_HIP)
// See [ why register transfer_dtype_op alias with cast_op? ] in cast_op.cc
REGISTER_CAST_CUDA_BASE(transfer_dtype, ops::CastOpKernel<CUDA, plat::bfloat16>)
#else
REGISTER_CAST_CUDA_BASE(transfer_dtype)
#endif
......@@ -18,7 +18,9 @@ limitations under the License. */
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/phi/infermeta/multiary.h"
#include "paddle/phi/kernels/funcs/concat_funcs.h"
#ifdef PADDLE_WITH_MKLDNN
......@@ -33,41 +35,6 @@ class ConcatOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", "Concat");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Concat");
auto inputs_dims = ctx->GetInputsDim("X");
const size_t inputs_num = inputs_dims.size();
PADDLE_ENFORCE_GT(
inputs_num, static_cast<size_t>(0),
platform::errors::InvalidArgument(
"The number of input tensors in concat op should > 0. But "
"received inputs' length is 0."));
if (inputs_num == 1) {
VLOG(3) << "Warning: concat op have only one input, may waste memory";
}
if (ctx->HasInput("AxisTensor")) {
auto out_dims =
phi::make_ddim(std::vector<int>(inputs_dims[0].size(), -1));
ctx->SetOutputDim("Out", out_dims);
ctx->ShareLoD("X", /*->*/ "Out");
} else {
size_t axis =
ComputeAxis(static_cast<int64_t>(ctx->Attrs().Get<int>("axis")),
static_cast<int64_t>(inputs_dims[0].size()));
framework::DDim out_dims =
phi::funcs::ComputeAndCheckShape(ctx->IsRuntime(), inputs_dims, axis);
if (out_dims[axis] < 0) {
out_dims[axis] = -1;
}
ctx->SetOutputDim("Out", out_dims);
ctx->ShareLoD("X", /*->*/ "Out");
}
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
......@@ -237,9 +204,14 @@ class ConcatDoubleGradOpMaker : public framework::SingleGradOpMaker<T> {
} // namespace paddle
namespace ops = paddle::operators;
DELCARE_INFER_SHAPE_FUNCTOR(concat, ConcatInferShapeFunctor,
PT_INFER_META(phi::ConcatInferMeta));
REGISTER_OPERATOR(concat, ops::ConcatOp, ops::ConcatOpMaker,
ops::ConcatGradOpMaker<paddle::framework::OpDesc>,
ops::ConcatGradOpMaker<paddle::imperative::OpBase>);
ops::ConcatGradOpMaker<paddle::imperative::OpBase>,
ConcatInferShapeFunctor);
REGISTER_OPERATOR(concat_grad, ops::ConcatOpGrad,
ops::ConcatDoubleGradOpMaker<paddle::framework::OpDesc>,
ops::ConcatDoubleGradOpMaker<paddle::imperative::OpBase>,
......
......@@ -20,5 +20,5 @@ else()
endif()
file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(equal_all);\nUSE_NO_KERNEL_OP(read_from_array);\n")
file(APPEND ${pybind_file} "USE_OP(logical_and);\nUSE_OP(logical_or);\nUSE_OP(logical_xor);\nUSE_OP(logical_not);\n")
file(APPEND ${pybind_file} "USE_OP_ITSELF(logical_and);\nUSE_OP_ITSELF(logical_or);\nUSE_OP_ITSELF(logical_xor);\nUSE_OP_ITSELF(logical_not);\n")
file(APPEND ${pybind_file} "USE_OP(bitwise_and);\nUSE_OP(bitwise_or);\nUSE_OP(bitwise_xor);\nUSE_OP(bitwise_not);\n")
......@@ -9,11 +9,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/controlflow/logical_op.h"
#include <algorithm>
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
namespace paddle {
namespace operators {
......@@ -145,15 +145,7 @@ class BinaryLogicalOp : public LogicalOp {
::paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_BINARY_LOGICAL_OP(logical_and, "$$Out = X \\&\\& Y$$");
REGISTER_BINARY_LOGICAL_KERNEL(logical_and, CPU,
paddle::operators::LogicalAndFunctor);
REGISTER_BINARY_LOGICAL_OP(logical_or, "$$Out = X || Y$$");
REGISTER_BINARY_LOGICAL_KERNEL(logical_or, CPU,
paddle::operators::LogicalOrFunctor);
REGISTER_UNARY_LOGICAL_OP(logical_not, "$$Out = !X$$");
REGISTER_UNARY_LOGICAL_KERNEL(logical_not, CPU,
paddle::operators::LogicalNotFunctor);
REGISTER_BINARY_LOGICAL_OP(logical_xor,
"$$Out = (X || Y) \\&\\& !(X \\&\\& Y)$$");
REGISTER_BINARY_LOGICAL_KERNEL(logical_xor, CPU,
paddle::operators::LogicalXorFunctor);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/controlflow/logical_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h"
namespace paddle {
namespace operators {
template <typename Functor>
class BinaryLogicalOpKernel<platform::CUDADeviceContext, Functor>
: public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
using InT = typename Functor::ELEMENT_TYPE;
using OutT = bool;
auto functor = Functor();
std::vector<const framework::Tensor*> ins;
std::vector<framework::Tensor*> outs;
const auto& cuda_ctx =
ctx.template device_context<platform::CUDADeviceContext>();
int axis = PackTensorsIntoVector<OutT>(ctx, &ins, &outs);
if (ins.size() == 1) {
paddle::operators::LaunchElementwiseCudaKernel<ElementwiseType::kUnary,
InT, OutT>(
cuda_ctx, ins, &outs, axis, functor);
} else {
paddle::operators::LaunchElementwiseCudaKernel<ElementwiseType::kBinary,
InT, OutT>(
cuda_ctx, ins, &outs, axis, functor);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
#define REGISTER_LOGICAL_CUDA_KERNEL(op_name, func) \
REGISTER_OP_CUDA_KERNEL( \
op_name, \
ops::BinaryLogicalOpKernel<plat::CUDADeviceContext, ops::func<bool>>, \
ops::BinaryLogicalOpKernel<plat::CUDADeviceContext, ops::func<int8_t>>, \
ops::BinaryLogicalOpKernel<plat::CUDADeviceContext, ops::func<int16_t>>, \
ops::BinaryLogicalOpKernel<plat::CUDADeviceContext, ops::func<int>>, \
ops::BinaryLogicalOpKernel<plat::CUDADeviceContext, ops::func<int64_t>>, \
ops::BinaryLogicalOpKernel<plat::CUDADeviceContext, ops::func<float>>, \
ops::BinaryLogicalOpKernel<plat::CUDADeviceContext, ops::func<double>>);
REGISTER_LOGICAL_CUDA_KERNEL(logical_or, LogicalOrFunctor)
REGISTER_LOGICAL_CUDA_KERNEL(logical_and, LogicalAndFunctor)
REGISTER_LOGICAL_CUDA_KERNEL(logical_xor, LogicalXorFunctor)
REGISTER_LOGICAL_CUDA_KERNEL(logical_not, LogicalNotFunctor)
#undef REGISTER_LOGICAL_CUDA_KERNEL
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <math.h>
#include <type_traits>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
#include "paddle/fluid/platform/transform.h"
namespace paddle {
namespace operators {
#define LOGICAL_BINARY_FUNCTOR(func_name, op) \
template <typename T> \
struct func_name { \
using ELEMENT_TYPE = T; \
HOSTDEVICE bool operator()(const T a, const T b) const { \
return static_cast<bool>(a) op static_cast<bool>(b); \
} \
};
LOGICAL_BINARY_FUNCTOR(LogicalOrFunctor, ||)
LOGICAL_BINARY_FUNCTOR(LogicalAndFunctor, &&)
LOGICAL_BINARY_FUNCTOR(LogicalXorFunctor, ^)
#undef LOGICAL_BINARY_FUNCTOR
template <typename T>
struct LogicalNotFunctor {
using ELEMENT_TYPE = T;
HOSTDEVICE bool operator()(const T a) const { return !a; }
};
template <typename DeviceContext, typename Functor>
class BinaryLogicalOpKernel
: public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
public:
void Compute(const framework::ExecutionContext& context) const override {
using T = typename Functor::ELEMENT_TYPE;
auto* x = context.Input<framework::Tensor>("X");
auto* y = context.Input<framework::Tensor>("Y");
auto* out = context.Output<framework::Tensor>("Out");
Functor binary_func;
ElementwiseComputeEx<Functor, DeviceContext, T, bool>(context, x, y, -1,
binary_func, out);
}
};
template <typename DeviceContext, typename Functor>
class UnaryLogicalOpKernel
: public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
public:
void Compute(const framework::ExecutionContext& context) const override {
using T = typename Functor::ELEMENT_TYPE;
auto* x = context.Input<framework::Tensor>("X");
auto* out = context.Output<framework::Tensor>("Out");
Functor unary_func;
platform::Transform<DeviceContext> trans;
trans(context.template device_context<DeviceContext>(), x->data<T>(),
x->data<T>() + x->numel(),
out->mutable_data<bool>(context.GetPlace()), unary_func);
}
};
} // namespace operators
} // namespace paddle
#define REGISTER_BINARY_LOGICAL_KERNEL(op_type, dev, functor) \
REGISTER_OP_##dev##_KERNEL( \
op_type, ::paddle::operators::BinaryLogicalOpKernel< \
::paddle::platform::dev##DeviceContext, functor<bool>>, \
::paddle::operators::BinaryLogicalOpKernel< \
::paddle::platform::dev##DeviceContext, functor<int8_t>>, \
::paddle::operators::BinaryLogicalOpKernel< \
::paddle::platform::dev##DeviceContext, functor<int16_t>>, \
::paddle::operators::BinaryLogicalOpKernel< \
::paddle::platform::dev##DeviceContext, functor<int>>, \
::paddle::operators::BinaryLogicalOpKernel< \
::paddle::platform::dev##DeviceContext, functor<int64_t>>, \
::paddle::operators::BinaryLogicalOpKernel< \
::paddle::platform::dev##DeviceContext, functor<float>>, \
::paddle::operators::BinaryLogicalOpKernel< \
::paddle::platform::dev##DeviceContext, functor<double>>);
#define REGISTER_UNARY_LOGICAL_KERNEL(op_type, dev, functor) \
REGISTER_OP_##dev##_KERNEL( \
op_type, ::paddle::operators::UnaryLogicalOpKernel< \
::paddle::platform::dev##DeviceContext, functor<bool>>, \
::paddle::operators::UnaryLogicalOpKernel< \
::paddle::platform::dev##DeviceContext, functor<int8_t>>, \
::paddle::operators::UnaryLogicalOpKernel< \
::paddle::platform::dev##DeviceContext, functor<int16_t>>, \
::paddle::operators::UnaryLogicalOpKernel< \
::paddle::platform::dev##DeviceContext, functor<int>>, \
::paddle::operators::UnaryLogicalOpKernel< \
::paddle::platform::dev##DeviceContext, functor<int64_t>>, \
::paddle::operators::UnaryLogicalOpKernel< \
::paddle::platform::dev##DeviceContext, functor<float>>, \
::paddle::operators::UnaryLogicalOpKernel< \
::paddle::platform::dev##DeviceContext, functor<double>>);
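The logical kernels above are assembled from small functors (LOGICAL_BINARY_FUNCTOR casts both operands to bool and applies the operator) that the kernels then map over every element. A self-contained sketch of that pattern with std::transform, leaving out Paddle's ElementwiseComputeEx, broadcasting, and device contexts:
#include <algorithm>
#include <cassert>
#include <vector>
// Functor in the same shape as LOGICAL_BINARY_FUNCTOR above: both operands
// are cast to bool before the operator is applied.
template <typename T>
struct LogicalAndSketch {
  bool operator()(const T a, const T b) const {
    return static_cast<bool>(a) && static_cast<bool>(b);
  }
};
// Minimal CPU analogue of BinaryLogicalOpKernel: apply the functor
// elementwise over two same-sized inputs into a bool output.
template <typename T, typename Functor>
std::vector<bool> ApplyBinaryLogical(const std::vector<T>& x,
                                     const std::vector<T>& y, Functor func) {
  assert(x.size() == y.size());
  std::vector<bool> out(x.size());
  std::transform(x.begin(), x.end(), y.begin(), out.begin(), func);
  return out;
}
// Usage: ApplyBinaryLogical<int>({1, 0, 2}, {1, 1, 0}, LogicalAndSketch<int>{})
// yields {true, false, false}.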
......@@ -9,7 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/controlflow/logical_op.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
......
......@@ -14,6 +14,10 @@
#include "paddle/fluid/operators/dot_op.h"
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/binary.h"
namespace paddle {
namespace operators {
......@@ -21,51 +25,6 @@ class DotOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE_EQ(true, ctx->HasInput("X"),
platform::errors::PreconditionNotMet(
"Input(X) of DotOp should not be null."));
PADDLE_ENFORCE_EQ(true, ctx->HasInput("Y"),
platform::errors::PreconditionNotMet(
"Input(Y) of DotOp should not be null."));
PADDLE_ENFORCE_EQ(true, ctx->HasOutput("Out"),
platform::errors::PreconditionNotMet(
"Output(Out) of DotOp should not be null."));
auto x_dims = ctx->GetInputDim("X");
auto x_rank = static_cast<size_t>(x_dims.size());
PADDLE_ENFORCE_EQ(true, 1 == x_rank || 2 == x_rank,
platform::errors::PreconditionNotMet(
"ShapeError: The dimensions of input tensor X (%s) "
"should be 1 or 2",
x_dims.to_str()));
auto y_dims = ctx->GetInputDim("Y");
PADDLE_ENFORCE_EQ(
true, x_rank == (size_t)y_dims.size(),
platform::errors::PreconditionNotMet(
"ShapeError: The shape of input tensor Y: %s should match with "
"input tenosr X: %s",
y_dims.to_str(), x_dims.to_str()));
bool shape_match = true;
for (size_t i = 0; i < x_rank; ++i) {
if (x_dims[i] != y_dims[i]) {
shape_match = false;
break;
}
}
PADDLE_ENFORCE_EQ(true, shape_match,
platform::errors::PreconditionNotMet(
"ShapeError: The shape of input tensor X: %s should "
"be exactly the same "
"with input tensor Y: %s",
x_dims.to_str(), y_dims.to_str()));
auto dims = vectorize(x_dims);
dims[dims.size() - 1] = 1;
ctx->SetOutputDim("Out", phi::make_ddim(dims));
}
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
......@@ -142,9 +101,13 @@ class DotOpGradMaker : public framework::SingleGradOpMaker<T> {
namespace ops = paddle::operators;
DELCARE_INFER_SHAPE_FUNCTOR(dot, DotInferShapeFunctor,
PT_INFER_META(phi::DotInferMeta));
REGISTER_OPERATOR(dot, ops::DotOp, ops::DotOpMaker,
ops::DotOpGradMaker<paddle::framework::OpDesc>,
ops::DotOpGradMaker<paddle::imperative::OpBase>);
ops::DotOpGradMaker<paddle::imperative::OpBase>,
DotInferShapeFunctor);
REGISTER_OPERATOR(dot_grad, ops::DotGradOp);
......
......@@ -201,12 +201,14 @@ REGISTER_OPERATOR(gather_grad, ops::GatherGradOp,
REGISTER_OP_CPU_KERNEL(gather, ops::GatherOpKernel<float>,
ops::GatherOpKernel<double>, ops::GatherOpKernel<int>,
ops::GatherOpKernel<uint8_t>,
ops::GatherOpKernel<int64_t>);
ops::GatherOpKernel<int64_t>,
ops::GatherOpKernel<phi::dtype::bfloat16>);
REGISTER_OP_CPU_KERNEL(gather_grad, ops::GatherGradientOpKernel<float>,
ops::GatherGradientOpKernel<double>,
ops::GatherGradientOpKernel<int>,
ops::GatherGradientOpKernel<uint8_t>,
ops::GatherGradientOpKernel<int64_t>);
ops::GatherGradientOpKernel<int64_t>,
ops::GatherGradientOpKernel<phi::dtype::bfloat16>);
REGISTER_OP_VERSION(gather)
.AddCheckpoint(R"ROC(upgrad gather, add a new input [Axis])ROC",
paddle::framework::compatible::OpVersionDesc().NewInput(
......
......@@ -130,9 +130,11 @@ REGISTER_OP_CUDA_KERNEL(gather, ops::GatherOpCUDAKernel<float>,
ops::GatherOpCUDAKernel<double>,
ops::GatherOpCUDAKernel<int64_t>,
ops::GatherOpCUDAKernel<int>,
ops::GatherOpCUDAKernel<plat::float16>);
ops::GatherOpCUDAKernel<plat::float16>,
ops::GatherOpCUDAKernel<plat::bfloat16>);
REGISTER_OP_CUDA_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel<float>,
ops::GatherGradOpCUDAKernel<double>,
ops::GatherGradOpCUDAKernel<int64_t>,
ops::GatherGradOpCUDAKernel<int>,
ops::GatherGradOpCUDAKernel<plat::float16>);
ops::GatherGradOpCUDAKernel<plat::float16>,
ops::GatherGradOpCUDAKernel<plat::bfloat16>);
......@@ -29,6 +29,7 @@ namespace operators {
using DataLayout = framework::DataLayout;
enum GroupNormKernelFlags { kHasScale = 1, kHasBias = 2 };
#define ALIGN_BYTES 16
#define CHECK_CASE(i, flags, kernel_name, ...) \
if (i == flags) { \
......@@ -56,8 +57,7 @@ __device__ __inline__ void CudaAtomicAddWithWarp(T* sum, T value) {
template <typename T>
__global__ void GroupNormForwardGetMeanAndVar(const T* x, int N, int C, int W,
int imsize, int groups,
int group_size, T* mean, T* var,
const DataLayout data_layout) {
int group_size, T* mean, T* var) {
int gid = blockIdx.y;
int cid = blockIdx.x;
int bid = blockIdx.z;
......@@ -68,13 +68,10 @@ __global__ void GroupNormForwardGetMeanAndVar(const T* x, int N, int C, int W,
T x_mean = 0, x_var = 0;
for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) {
T val;
if (data_layout == DataLayout::kNCHW) {
val = x[(bid * C + ccid) * imsize + imid];
} else {
int hid = imid / W;
int wid = imid % W;
val = x[(bid * H + hid) * W * C + wid * C + ccid];
}
x_mean += val;
x_var += val * val;
}
......@@ -84,6 +81,85 @@ __global__ void GroupNormForwardGetMeanAndVar(const T* x, int N, int C, int W,
CudaAtomicAddWithWarp(&var[bid * groups + gid], x_var);
}
template <typename T, typename AccT, int VecSize>
__device__ __forceinline__ void ThreadReduce(const T* input, int size,
const int offset, AccT* mean,
AccT* var) {
using VecT = kps::details::VectorType<T, VecSize>;
int tid = threadIdx.x;
if (offset > 0) {
input -= offset;
size += offset;
if (tid >= offset) {
AccT temp = input[tid];
*mean += temp;
*var += temp * temp;
}
size -= blockDim.x;
input += blockDim.x;
}
int remain = size % (VecSize * blockDim.x);
T ins[VecSize];
VecT* ins_vec = reinterpret_cast<VecT*>(&ins);
// vector part
for (; VecSize * tid < (size - remain); tid += blockDim.x) {
*ins_vec = reinterpret_cast<const VecT*>(input)[tid];
#pragma unroll
for (int i = 0; i < VecSize; ++i) {
AccT temp = ins[i];
*mean += temp;
*var += temp * temp;
}
}
// scalar part
tid = size - remain + threadIdx.x;
for (; tid < size; tid += blockDim.x) {
AccT temp = input[tid];
*mean += temp;
*var += temp * temp;
}
}
template <typename T>
__global__ void ScalarGetMeanAndVarNCHW(const T* x, T* mean, T* var, int size) {
int i = blockIdx.x;
T x_mean = 0, x_var = 0;
for (int j = threadIdx.x; j < size; j += blockDim.x) {
T val;
val = x[i * size + j];
x_mean += val;
x_var += val * val;
}
x_mean /= size;
x_var /= size;
CudaAtomicAddWithWarp(&mean[i], x_mean);
CudaAtomicAddWithWarp(&var[i], x_var);
}
template <typename T, typename AccT, int VecSize>
__global__ void VectorizedGetMeanAndVarNCHW(const T* x, T* mean, T* var,
int size) {
int i = blockIdx.x;
AccT x_mean = static_cast<AccT>(0);
AccT x_var = static_cast<AccT>(0);
const int input_offset = ((uint64_t)x) % ALIGN_BYTES / sizeof(T);
x += i * size;
ThreadReduce<T, AccT, VecSize>(x, size, input_offset, &x_mean, &x_var);
x_mean = kps::details::BlockXReduce<AccT, kps::AddFunctor<AccT>>(
x_mean, kps::AddFunctor<AccT>());
x_var = kps::details::BlockXReduce<AccT, kps::AddFunctor<AccT>>(
x_var, kps::AddFunctor<AccT>());
__syncthreads();
if (threadIdx.x == 0) {
mean[i] = static_cast<T>(x_mean / size);
var[i] = static_cast<T>(x_var / size);
}
}
template <typename T, int flags>
__global__ void GroupNormForward(const T* x, const T* mean, const T* var,
const T* scale, const T* bias, int N, int C,
......@@ -96,26 +172,34 @@ __global__ void GroupNormForward(const T* x, const T* mean, const T* var,
int H = imsize / W;
int ccid = gid * group_size + cid;
if (ccid >= C) return;
T x_mean = mean[bid * groups + gid];
T x_var = var[bid * groups + gid];
auto ng = bid * groups + gid;
T x_mean = mean[ng];
T x_var = var[ng];
x_var = x_var - x_mean * x_mean;
T var_inv = 1.0 / sqrt(x_var + epsilon);
if (cid == 0 && threadIdx.x == 0) real_var[bid * groups + gid] = x_var;
T var_inv = rsqrt(x_var + epsilon);
if (cid == 0 && threadIdx.x == 0) {
real_var[ng] = x_var;
}
for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) {
T val;
int hid, wid;
int index = (bid * C + ccid) * imsize + imid;
if (data_layout == DataLayout::kNCHW) {
val = x[(bid * C + ccid) * imsize + imid];
val = x[index];
} else {
hid = imid / W;
wid = imid % W;
val = x[(bid * H + hid) * W * C + wid * C + ccid];
}
val = (val - x_mean) * var_inv;
if (flags & kHasScale) val *= scale[gid * group_size + cid];
if (flags & kHasBias) val += bias[gid * group_size + cid];
if (flags & kHasScale) {
val *= scale[ccid];
}
if (flags & kHasBias) {
val += bias[ccid];
}
if (data_layout == DataLayout::kNCHW) {
y[(bid * C + ccid) * imsize + imid] = val;
y[index] = val;
} else {
y[(bid * H + hid) * W * C + wid * C + ccid] = val;
}
......@@ -182,16 +266,41 @@ class GroupNormKernel<platform::CUDADeviceContext, T>
imsize *= x_dims[i];
}
}
#ifdef __HIPCC__
int block_size = std::max(std::min(256, imsize), 64);
#else
int block_size = std::min(1024, imsize);
#endif
dim3 grid(group_size, groups, x_dims[0]);
dim3 threads(block_size, 1, 1);
if (data_layout == DataLayout::kNCHW) {
using AccT = typename details::MPTypeTrait<T>::Type;
constexpr int vec_size = sizeof(float4) / sizeof(T);
int size = group_size * imsize;
const int max_num_threads = 1024;
int max_block_size = std::min(size / vec_size, max_num_threads);
int block_size_nchw = 1;
while (block_size_nchw < max_block_size) {
block_size_nchw *= 2;
}
block_size_nchw = std::max(block_size_nchw, kps::details::kWarpSize);
dim3 grids(x_dims[0] * groups);
dim3 blocks(block_size_nchw);
if (size < vec_size) {
ScalarGetMeanAndVarNCHW<T><<<grids, blocks, 0, dev_ctx.stream()>>>(
x_data, mean_data, temp_var_data, size);
} else {
VectorizedGetMeanAndVarNCHW<
T, AccT, vec_size><<<grids, blocks, 0, dev_ctx.stream()>>>(
x_data, mean_data, temp_var_data, size);
}
} else {
GroupNormForwardGetMeanAndVar<T><<<grid, threads, 0, dev_ctx.stream()>>>(
x_data, x_dims[0], C, W, imsize, groups, group_size, mean_data,
temp_var_data, data_layout);
temp_var_data);
}
int flags =
(scale_data != nullptr) * kHasScale + (bias_data != nullptr) * kHasBias;
UNROLL_ALL_CASES(flags, GroupNormForward, x_data, mean_data, temp_var_data,
......
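The NCHW fast path above accumulates a per-group sum and sum of squares (per thread, then reduced across the block), derives the variance as E[x^2] - mean^2, and GroupNormForward normalizes with rsqrt(var + epsilon) before applying the optional per-channel scale and bias. A scalar CPU sketch of the same arithmetic, without the CUDA block reductions or vectorized loads:
#include <cmath>
#include <cstddef>
#include <vector>
// CPU sketch of the group-norm math used above: mean and variance from the
// sum / sum-of-squares, then (x - mean) * rsqrt(var + eps) * scale + bias.
// `group` holds all elements of one (batch, group) slice; scale and bias are
// already resolved to the scalars for the current channel.
std::vector<float> GroupNormSketch(const std::vector<float>& group, float scale,
                                   float bias, float epsilon) {
  double sum = 0.0, sqsum = 0.0;
  for (float v : group) {
    sum += v;
    sqsum += static_cast<double>(v) * v;
  }
  const double n = static_cast<double>(group.size());
  const double mean = sum / n;
  const double var = sqsum / n - mean * mean;  // E[x^2] - mean^2, as in the kernel
  const float inv_std = 1.0f / std::sqrt(static_cast<float>(var) + epsilon);
  std::vector<float> out(group.size());
  for (std::size_t i = 0; i < group.size(); ++i) {
    out[i] = static_cast<float>(group[i] - mean) * inv_std * scale + bias;
  }
  return out;
}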
......@@ -12,12 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/index_sample_op.h"
#include <vector>
#include "paddle/fluid/framework/no_need_buffer_vars_inference.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/binary.h"
namespace paddle {
namespace operators {
class IndexSampleOpMaker : public framework::OpProtoAndCheckerMaker {
......@@ -42,44 +44,6 @@ class IndexSampleOpMaker : public framework::OpProtoAndCheckerMaker {
class IndexSampleOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
platform::errors::InvalidArgument(
"Inputs(Input) of FindByIndex should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasInput("Index"), true,
platform::errors::InvalidArgument(
"Inputs(Index) of FindByIndex should not be null."));
auto input_dims = ctx->GetInputDim("X");
PADDLE_ENFORCE_EQ(
input_dims.size(), 2,
platform::errors::InvalidArgument(
"Inputs(X) shape of IndexSample op should be 2-D, but "
"got X's shape = [%s], please check X shape.",
input_dims));
auto index_dims = ctx->GetInputDim("Index");
PADDLE_ENFORCE_EQ(
input_dims.size(), 2,
platform::errors::InvalidArgument(
"Inputs(Index) shape of IndexSample op should be 2-D, but "
"got Index's shape [%s] , please check index shape.",
input_dims));
if (ctx->IsRuntime()) {
PADDLE_ENFORCE_EQ(input_dims[0], index_dims[0],
platform::errors::InvalidArgument(
"Inputs(X)'s value of dimension 0 must same with "
"Inputs(Index)'s value of dimension 0, but "
"got %d of Inputs(X), and got %d of Inputs(Index), "
"please check Inputs shape.",
input_dims[0], index_dims[0]));
}
ctx->SetOutputDim("Out", index_dims);
auto type = ctx->GetInputsVarType("Index")[0];
if (type == framework::proto::VarType::LOD_TENSOR) {
ctx->ShareLoD("Index", /*->*/ "Out");
}
}
protected:
framework::OpKernelType GetExpectedKernelType(
......@@ -136,20 +100,11 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(IndexSampleGradNoNeedBufferVarInferer, "X");
} // namespace paddle
namespace ops = paddle::operators;
DELCARE_INFER_SHAPE_FUNCTOR(index_sample, IndexSampleInferShapeFunctor,
PT_INFER_META(phi::IndexSampleInferMeta));
REGISTER_OPERATOR(index_sample, ops::IndexSampleOp, ops::IndexSampleOpMaker,
ops::IndexSampleGradMaker<paddle::framework::OpDesc>,
ops::IndexSampleGradMaker<paddle::imperative::OpBase>);
ops::IndexSampleGradMaker<paddle::imperative::OpBase>,
IndexSampleInferShapeFunctor);
REGISTER_OPERATOR(index_sample_grad, ops::IndexSampleGradOp,
ops::IndexSampleGradNoNeedBufferVarInferer);
REGISTER_OP_CPU_KERNEL(
index_sample,
ops::IndexSampleKernel<paddle::platform::CPUDeviceContext, float>,
ops::IndexSampleKernel<paddle::platform::CPUDeviceContext, double>,
ops::IndexSampleKernel<paddle::platform::CPUDeviceContext, int>,
ops::IndexSampleKernel<paddle::platform::CPUDeviceContext, int64_t>);
REGISTER_OP_CPU_KERNEL(
index_sample_grad,
ops::IndexSampleGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::IndexSampleGradKernel<paddle::platform::CPUDeviceContext, double>,
ops::IndexSampleGradKernel<paddle::platform::CPUDeviceContext, int>,
ops::IndexSampleGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/index_sample_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#define PREDEFINED_BLOCK_SIZE_X 512
#define PREDEFINED_BLOCK_SIZE 1024
#define MIN(a, b) ((a) < (b) ? (a) : (b))
namespace paddle {
namespace operators {
namespace {
void LimitGridDim(const framework::ExecutionContext& ctx, dim3* grid_dim) {
auto max_grid_dim = ctx.template device_context<platform::CUDADeviceContext>()
.GetCUDAMaxGridDimSize();
grid_dim->x = grid_dim->x < max_grid_dim[0] ? grid_dim->x : max_grid_dim[0];
grid_dim->y = grid_dim->y < max_grid_dim[1] ? grid_dim->y : max_grid_dim[1];
}
}
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
template <typename T, typename IndexT = int>
__global__ void IndexSampleForward(const IndexT* index, const T* in_data,
T* out_data, size_t index_length,
size_t input_length, size_t batch_size) {
unsigned int index_i = blockDim.x * blockIdx.x + threadIdx.x;
unsigned int index_j = blockDim.y * blockIdx.y + threadIdx.y;
for (; index_j < batch_size; index_j += blockDim.y * gridDim.y) {
index_i = blockDim.x * blockIdx.x + threadIdx.x;
for (; index_i < index_length; index_i += blockDim.x * gridDim.x) {
unsigned int index_idx = index_j * index_length + index_i;
unsigned int in_idx = index_j * input_length + index_i;
IndexT sample_idx = index[index_idx];
out_data[index_idx] = in_data[in_idx - index_i + sample_idx];
}
}
}
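The kernel above computes out[j][i] = x[j][index[j][i]]: in_idx - index_i collapses to the row offset j * input_length, so each thread gathers one element from row j at column index[j][i]. A reference-only CPU sketch of the same mapping, name hypothetical:

// Reference-only CPU version of the forward sampling above.
template <typename T, typename IndexT>
void IndexSampleForwardRef(const IndexT* index, const T* in_data, T* out_data,
                           size_t index_length, size_t input_length,
                           size_t batch_size) {
  for (size_t j = 0; j < batch_size; ++j) {
    for (size_t i = 0; i < index_length; ++i) {
      size_t index_idx = j * index_length + i;
      out_data[index_idx] = in_data[j * input_length + index[index_idx]];
    }
  }
}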
template <typename T, typename IndexT = int>
__global__ void IndexSampleGrad(const IndexT* index, T* in_grad,
const T* out_grad, size_t index_length,
size_t input_length, size_t batch_size,
bool same_data_in_row = true) {
unsigned int index_i = blockDim.x * blockIdx.x + threadIdx.x;
unsigned int index_j = blockDim.y * blockIdx.y + threadIdx.y;
for (; index_j < batch_size; index_j += blockDim.y * gridDim.y) {
index_i = blockDim.x * blockIdx.x + threadIdx.x;
for (; index_i < index_length; index_i += blockDim.x * gridDim.x) {
unsigned int index_idx = index_j * index_length + index_i;
unsigned int in_idx = index_j * input_length + index_i;
IndexT sample_idx = index[index_idx];
if (same_data_in_row) {
platform::CudaAtomicAdd(&(in_grad[in_idx - index_i + sample_idx]),
out_grad[sample_idx]);
} else {
in_grad[in_idx - index_i + sample_idx] = out_grad[index_idx];
}
}
}
}
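In the backward pass, several entries of one Index row may point at the same input column, so their gradients must be accumulated (the atomicAdd branch above); when index_length == 1 there is at most one write per row and a plain store suffices. A reference-only CPU sketch of the accumulation, mirroring IndexSampleGradInner further below, name hypothetical:

// Reference-only CPU sketch: sum gradients of entries that sample the same
// input column; in_grad is assumed to be zero-initialized by the caller.
template <typename T, typename IndexT>
void IndexSampleGradRef(const IndexT* index, T* in_grad, const T* out_grad,
                        size_t index_length, size_t input_length,
                        size_t batch_size) {
  for (size_t j = 0; j < batch_size; ++j) {
    for (size_t i = 0; i < index_length; ++i) {
      size_t index_idx = j * index_length + i;
      in_grad[j * input_length + index[index_idx]] += out_grad[index_idx];
    }
  }
}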
template <typename T>
class IndexSampleKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<LoDTensor>("X");
auto* index = ctx.Input<LoDTensor>("Index");
auto* output = ctx.Output<LoDTensor>("Out");
const auto& index_type = framework::TransToProtoVarType(index->dtype());
bool index_type_match = index_type == framework::proto::VarType::INT64 ||
index_type == framework::proto::VarType::INT32;
PADDLE_ENFORCE_EQ(index_type_match, true,
platform::errors::InvalidArgument(
"Input(Index) holds the wrong type, it holds %s, but "
"desires to be %s or %s",
paddle::framework::DataTypeToString(index_type),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT32),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT64)));
const auto* in_data = input->data<T>();
auto* out_data = output->mutable_data<T>(ctx.GetPlace());
auto stream =
ctx.template device_context<platform::CUDADeviceContext>().stream();
auto input_dim = input->dims();
auto index_dim = index->dims();
size_t batch_size = input_dim[0];
size_t input_length = input_dim[1];
size_t index_length = index_dim[1];
auto block_width = platform::RoundToPowerOfTwo(index_length);
block_width = MIN(block_width, PREDEFINED_BLOCK_SIZE_X);
int block_height =
platform::RoundToPowerOfTwo(index_length * batch_size) / block_width;
block_height = MIN(block_height, PREDEFINED_BLOCK_SIZE / block_width);
dim3 block_dim(block_width, block_height);
dim3 grid_dim((index_length + block_dim.x - 1) / block_dim.x,
(batch_size + block_dim.y - 1) / block_dim.y);
LimitGridDim(ctx, &grid_dim);
if (index_type == framework::proto::VarType::INT64) {
const int64_t* index_data = index->data<int64_t>();
IndexSampleForward<T, int64_t><<<grid_dim, block_dim, 0, stream>>>(
index_data, in_data, out_data, index_length, input_length,
batch_size);
} else if (index_type == framework::proto::VarType::INT32) {
const int* index_data = index->data<int>();
IndexSampleForward<T, int><<<grid_dim, block_dim, 0, stream>>>(
index_data, in_data, out_data, index_length, input_length,
batch_size);
}
}
};
template <typename T>
class IndexSampleGradKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* output_grad = ctx.Input<LoDTensor>(framework::GradVarName("Out"));
auto* input_grad = ctx.Output<LoDTensor>(framework::GradVarName("X"));
auto* index = ctx.Input<LoDTensor>("Index");
const auto* output_grad_data = output_grad->data<T>();
auto* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
const auto& index_type = framework::TransToProtoVarType(index->dtype());
bool index_type_match = index_type == framework::proto::VarType::INT64 ||
index_type == framework::proto::VarType::INT32;
PADDLE_ENFORCE_EQ(index_type_match, true,
platform::errors::InvalidArgument(
"Input(Index) holds the wrong type, it holds %s, but "
"desires to be %s or %s",
paddle::framework::DataTypeToString(index_type),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT32),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT64)));
auto stream =
ctx.template device_context<platform::CUDADeviceContext>().stream();
auto input_num = input_grad->numel();
auto input_dim = input_grad->dims();
auto index_dim = index->dims();
size_t batch_size = index_dim[0];
size_t input_length = input_dim[1];
size_t index_length = index_dim[1];
bool same_data_in_index_row = index_length == 1 ? false : true;
auto block_width = platform::RoundToPowerOfTwo(index_length);
block_width = MIN(block_width, PREDEFINED_BLOCK_SIZE_X);
auto block_height =
platform::RoundToPowerOfTwo(index_length * batch_size) / block_width;
block_height = MIN(block_height, PREDEFINED_BLOCK_SIZE / block_width);
dim3 block_dim(block_width, block_height);
dim3 grid_dim((index_length + block_dim.x - 1) / block_dim.x,
(batch_size + block_dim.y - 1) / block_dim.y);
LimitGridDim(ctx, &grid_dim);
phi::funcs::SetConstant<platform::CUDADeviceContext, T> set_zero;
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
set_zero(dev_ctx, input_grad, static_cast<T>(0));
if (index_type == framework::proto::VarType::INT64) {
const int64_t* index_data = index->data<int64_t>();
IndexSampleGrad<T, int64_t><<<grid_dim, block_dim, 0, stream>>>(
index_data, input_grad_data, output_grad_data, index_length,
input_length, batch_size, same_data_in_index_row);
} else if (index_type == framework::proto::VarType::INT32) {
const int* index_data = index->data<int>();
IndexSampleGrad<T, int><<<grid_dim, block_dim, 0, stream>>>(
index_data, input_grad_data, output_grad_data, index_length,
input_length, batch_size, same_data_in_index_row);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
index_sample,
ops::IndexSampleKernel<paddle::platform::CUDADeviceContext, float>,
ops::IndexSampleKernel<paddle::platform::CUDADeviceContext, double>,
ops::IndexSampleKernel<paddle::platform::CUDADeviceContext, int>,
ops::IndexSampleKernel<paddle::platform::CUDADeviceContext, int64_t>);
REGISTER_OP_CUDA_KERNEL(
index_sample_grad,
ops::IndexSampleGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::IndexSampleGradKernel<paddle::platform::CUDADeviceContext, double>,
ops::IndexSampleGradKernel<paddle::platform::CUDADeviceContext, int>,
ops::IndexSampleGradKernel<paddle::platform::CUDADeviceContext, int64_t>);
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cmath>
#include <fstream>
#include <set>
#include <string>
#include <utility>
#include <vector>
#include "gflags/gflags.h"
#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
using DDim = framework::DDim;
template <typename T, typename IndexT = int>
void IndexSampleInner(const framework::ExecutionContext &context,
const LoDTensor &input, const LoDTensor &index,
LoDTensor *output) {
auto input_dims = input.dims();
auto index_dims = index.dims();
int batch_size = input_dims[0];
auto value_length = input_dims[1];
auto index_length = index_dims[1];
int index_ids_num = index.numel();
std::vector<T> input_vec;
std::vector<IndexT> index_vec;
paddle::framework::TensorToVector(input, context.device_context(),
&input_vec);
paddle::framework::TensorToVector(index, context.device_context(),
&index_vec);
std::vector<T> res(index_ids_num);
for (int i = 0; i < index_ids_num; i++) {
int b = floor(i / index_length);
PADDLE_ENFORCE_GE(
index_vec[i], 0,
platform::errors::InvalidArgument(
"Variable value (index) of OP(index_sample) "
"expected >= 0 and < %ld, but got %ld. Please check input "
"value.",
value_length, index_vec[i]));
PADDLE_ENFORCE_LT(
index_vec[i], value_length,
platform::errors::InvalidArgument(
"Variable value (index) of OP(index_sample) "
"expected >= 0 and < %ld, but got %ld. Please check input "
"value.",
value_length, index_vec[i]));
int v_i = b * value_length + static_cast<int>(index_vec[i]);
T v = input_vec[v_i];
VLOG(4) << "Index Sample: batch = " << b << " index = " << v_i
<< " value = " << v;
res[i] = v;
}
auto ddim = phi::make_ddim({batch_size, index_length});
output->mutable_data<T>(context.GetPlace());
framework::TensorFromVector(res, context.device_context(), output);
output->Resize(ddim);
}
template <typename DeviceContext, typename T>
class IndexSampleKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto *input_var = ctx.InputVar("X");
auto *index_var = ctx.InputVar("Index");
auto &input_tensor = input_var->Get<LoDTensor>();
auto &index_tensor = index_var->Get<LoDTensor>();
auto *out_var = ctx.OutputVar("Out");
auto *out_tensor = out_var->GetMutable<framework::LoDTensor>();
const auto &index_type =
framework::TransToProtoVarType(index_tensor.dtype());
bool index_type_match = index_type == framework::proto::VarType::INT32 ||
index_type == framework::proto::VarType::INT64;
PADDLE_ENFORCE_EQ(index_type_match, true,
platform::errors::InvalidArgument(
"Input(Index) holds the wrong type, it holds %s, but "
"desires to be %s or %s",
paddle::framework::DataTypeToString(index_type),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT32),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT64)));
if (index_type == framework::proto::VarType::INT32) {
IndexSampleInner<T, int>(ctx, input_tensor, index_tensor, out_tensor);
} else if (index_type == framework::proto::VarType::INT64) {
IndexSampleInner<T, int64_t>(ctx, input_tensor, index_tensor, out_tensor);
}
}
};
template <typename T, typename IndexT = int>
void IndexSampleGradInner(const framework::ExecutionContext &context,
const LoDTensor &out_grad, const LoDTensor &index,
LoDTensor *x_grad) {
std::vector<T> out_grad_vec;
std::vector<IndexT> index_vec;
paddle::framework::TensorToVector(out_grad, context.device_context(),
&out_grad_vec);
paddle::framework::TensorToVector(index, context.device_context(),
&index_vec);
auto index_dims = index.dims();
auto x_grad_dims = x_grad->dims();
auto value_length = x_grad_dims[1];
auto index_length = index_dims[1];
int index_ids_num = index.numel();
std::vector<T> x_grad_vec(x_grad->numel(), 0);
for (int i = 0; i < index_ids_num; i++) {
int b = floor(i / index_length);
PADDLE_ENFORCE_GE(
index_vec[i], 0,
platform::errors::InvalidArgument(
"Variable value (index) of OP(index_sample_grad) "
"expected >= 0 and < %ld, but got %ld. Please check input "
"value.",
value_length, index_vec[i]));
PADDLE_ENFORCE_LT(
index_vec[i], value_length,
platform::errors::InvalidArgument(
"Variable value (index) of OP(index_sample_grad) "
"expected >= 0 and < %ld, but got %ld. Please check input "
"value.",
value_length, index_vec[i]));
int v_i = b * value_length + static_cast<int>(index_vec[i]);
x_grad_vec[v_i] += out_grad_vec[i];
}
x_grad->mutable_data<T>(context.GetPlace());
framework::TensorFromVector(x_grad_vec, context.device_context(), x_grad);
x_grad->Resize(x_grad_dims);
}
template <typename DeviceContext, typename T>
class IndexSampleGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
auto *index_var = context.InputVar("Index");
auto *x_grad_var = context.OutputVar(framework::GradVarName("X"));
auto *out_grad_var = context.InputVar(framework::GradVarName("Out"));
auto &index_tensor = index_var->Get<LoDTensor>();
auto &out_grad_tensor = out_grad_var->Get<LoDTensor>();
auto *x_grad_tensor = x_grad_var->GetMutable<framework::LoDTensor>();
const auto &index_type =
framework::TransToProtoVarType(index_tensor.dtype());
bool index_type_match = index_type == framework::proto::VarType::INT32 ||
index_type == framework::proto::VarType::INT64;
PADDLE_ENFORCE_EQ(index_type_match, true,
platform::errors::InvalidArgument(
"Input(Index) holds the wrong type, it holds %s, but "
"desires to be %s or %s",
paddle::framework::DataTypeToString(index_type),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT32),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT64)));
if (index_type == framework::proto::VarType::INT32) {
IndexSampleGradInner<T, int>(context, out_grad_tensor, index_tensor,
x_grad_tensor);
} else if (index_type == framework::proto::VarType::INT64) {
IndexSampleGradInner<T, int64_t>(context, out_grad_tensor, index_tensor,
x_grad_tensor);
}
}
};
} // namespace operators
} // namespace paddle
......@@ -12,8 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/index_sample_op.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
......
......@@ -474,11 +474,11 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_bwd_1024_kernel(
for (int it = 0; it < LDGS; it++) {
#pragma unroll
for (int jt = 0; jt < VecSize; jt++) {
U x_tmp = x[it][jt];
U x_tmp = static_cast<U>(x[it][jt]);
U y_tmp = var_cur_row * (x_tmp - mean_cur_row);
U dy_tmp = static_cast<U>(gamma[it][jt]) *
static_cast<U>(dout[it][jt]); // scale * dy
U dout_tmp = dout[it][jt]; // dy
U dout_tmp = static_cast<U>(dout[it][jt]); // dy
// used for get dx (row reduction)
sum_loss1 += dy_tmp; // scale * dy, sum_1
......
......@@ -259,6 +259,21 @@ REGISTER_OP_CUDA_KERNEL(
ops::LayerNormGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::LayerNormGradKernel<paddle::platform::CUDADeviceContext,
plat::float16>);
#elif CUDNN_VERSION_MIN(8, 1, 0)
REGISTER_OP_CUDA_KERNEL(
layer_norm,
ops::LayerNormKernel<paddle::platform::CUDADeviceContext, float>,
ops::LayerNormKernel<paddle::platform::CUDADeviceContext, double>,
ops::LayerNormKernel<paddle::platform::CUDADeviceContext, plat::float16>,
ops::LayerNormKernel<paddle::platform::CUDADeviceContext, plat::bfloat16>);
REGISTER_OP_CUDA_KERNEL(
layer_norm_grad,
ops::LayerNormGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::LayerNormGradKernel<paddle::platform::CUDADeviceContext, double>,
ops::LayerNormGradKernel<paddle::platform::CUDADeviceContext,
plat::float16>,
ops::LayerNormGradKernel<paddle::platform::CUDADeviceContext,
plat::bfloat16>);
#else
REGISTER_OP_CUDA_KERNEL(
layer_norm,
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/operators/mkldnn/axpy_handler.h"
......@@ -502,32 +503,29 @@ struct MergeAdd<platform::XPUDeviceContext, T> {
out.mutable_value()->mutable_data<T>(
phi::make_ddim({static_cast<int64_t>(merge_rows.size()), input_width}),
context.GetPlace());
int r =
xpu::constant<T>(context.x_context(), out.mutable_value()->data<T>(),
merge_rows.size() * input_width, static_cast<T>(0.f));
PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
platform::errors::External("XPU constant op return"
" wrong value[%d %s].",
r, XPUAPIErrorMsg[r]));
std::unordered_map<int64_t, size_t> rows_to_id;
for (size_t i = 0; i < merge_rows.size(); ++i) {
rows_to_id[merge_rows[i]] = i;
}
auto* out_data = out.mutable_value()->data<T>();
auto* input_data = input.value().data<T>();
auto* y_data = out.mutable_value()->data<T>();
auto* x_data = input.value().data<T>();
int xm = input_rows.size();
int ym = merge_rows.size();
int n = input_width;
for (size_t i = 0; i < input_rows.size(); i++) {
size_t out_i = rows_to_id[input_rows[i]];
auto r = xpu::add(context.x_context(), &input_data[i * input_width],
&out_data[out_i * input_width],
&out_data[out_i * input_width], n);
PADDLE_ENFORCE_EQ(
r, XPU_SUCCESS,
platform::errors::External("XPU API return wrong value[%d %s], ", r,
XPUAPIErrorMsg[r]));
}
xpu::ctx_guard RAII_GUARD(context.x_context());
int64_t* x_rows_data = RAII_GUARD.alloc_l3_or_gm<int64_t>(xm);
int64_t* y_rows_data = RAII_GUARD.alloc_l3_or_gm<int64_t>(ym);
memory::Copy(context.GetPlace(), y_rows_data, platform::CPUPlace(),
merge_rows.data(), ym * sizeof(int64_t));
memory::Copy(context.GetPlace(), x_rows_data, platform::CPUPlace(),
input_rows.data(), xm * sizeof(int64_t));
int r =
xpu::merge_dup_rows<T, int64_t>(context.x_context(), x_data, y_data,
x_rows_data, y_rows_data, xm, n, ym);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "merge_dup_rows");
}
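The removed per-row xpu::constant + xpu::add loop is replaced by a single merge_dup_rows call: input rows whose row ids repeat are summed into one deduplicated output row. A reference-only host sketch of that semantics, consistent with the old loop above; name hypothetical, needs <unordered_map> and <algorithm>:

// Reference-only sketch of the merge_dup_rows semantics: scatter-add input
// rows into the deduplicated output rows.
template <typename T>
void MergeDupRowsRef(const T* x, T* y, const int64_t* x_rows, int xm,
                     const int64_t* y_rows, int ym, int n) {
  std::unordered_map<int64_t, int> row_to_out;
  for (int i = 0; i < ym; ++i) row_to_out[y_rows[i]] = i;
  std::fill(y, y + static_cast<int64_t>(ym) * n, static_cast<T>(0));
  for (int i = 0; i < xm; ++i) {
    const int out_i = row_to_out.at(x_rows[i]);
    for (int j = 0; j < n; ++j) y[out_i * n + j] += x[i * n + j];
  }
}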
void operator()(const platform::XPUDeviceContext& context,
......@@ -582,15 +580,7 @@ struct MergeAdd<platform::XPUDeviceContext, T> {
{static_cast<int64_t>(merged_row_set.size()), input_width}),
context.GetPlace());
int r =
xpu::constant<T>(context.x_context(), out.mutable_value()->data<T>(),
merge_rows.size() * input_width, static_cast<T>(0.f));
PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
platform::errors::External("XPU constant op return"
" wrong value[%d %s].",
r, XPUAPIErrorMsg[r]));
float* out_data = reinterpret_cast<float*>(out.mutable_value()->data<T>());
float* y_data = reinterpret_cast<float*>(out.mutable_value()->data<T>());
std::unordered_map<int64_t, size_t> rows_to_id;
for (size_t i = 0; i < merge_rows.size(); ++i) {
......@@ -603,17 +593,22 @@ struct MergeAdd<platform::XPUDeviceContext, T> {
}
auto& input_rows = input->rows();
auto* x_data = input->value().data<T>();
int xm = input_rows.size();
int ym = merge_rows.size();
int n = input_width;
for (size_t i = 0; i < input_rows.size(); i++) {
size_t out_i = rows_to_id[input_rows[i]];
auto r = xpu::add(
context.x_context(), input->value().data<T>() + i * input_width,
&out_data[out_i * input_width], &out_data[out_i * input_width], n);
PADDLE_ENFORCE_EQ(
r, XPU_SUCCESS,
platform::errors::External("XPU API return wrong value[%d %s], ", r,
XPUAPIErrorMsg[r]));
}
xpu::ctx_guard RAII_GUARD(context.x_context());
int64_t* x_rows_data = RAII_GUARD.alloc_l3_or_gm<int64_t>(xm);
int64_t* y_rows_data = RAII_GUARD.alloc_l3_or_gm<int64_t>(ym);
memory::Copy(context.GetPlace(), y_rows_data, platform::CPUPlace(),
merge_rows.data(), ym * sizeof(int64_t));
memory::Copy(context.GetPlace(), x_rows_data, platform::CPUPlace(),
input_rows.data(), xm * sizeof(int64_t));
int r =
xpu::merge_dup_rows<T, int64_t>(context.x_context(), x_data, y_data,
x_rows_data, y_rows_data, xm, n, ym);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "merge_dup_rows");
}
}
};
......
......@@ -16,6 +16,7 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/platform/bfloat16.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/phi/kernels/funcs/math_function.h"
......@@ -445,6 +446,7 @@ template struct MergeAdd<platform::CUDADeviceContext, double>;
template struct MergeAdd<platform::CUDADeviceContext, int>;
template struct MergeAdd<platform::CUDADeviceContext, int64_t>;
template struct MergeAdd<platform::CUDADeviceContext, platform::float16>;
template struct MergeAdd<platform::CUDADeviceContext, platform::bfloat16>;
template struct MergeAdd<platform::CUDADeviceContext, platform::complex<float>>;
template struct MergeAdd<platform::CUDADeviceContext,
platform::complex<double>>;
......
......@@ -61,30 +61,31 @@ class DistributedFusedLambInitOpMaker
"The fp32 beta1 power accumulator tensor. Its shape is [1].");
AddOutput("Beta2Pow",
"The fp32 beta2 power accumulator tensor. Its shape is [1].");
AddOutput("FusedIndices",
"The param index of each element in FP32FusedParam. Its shape is "
"[M1+M2]. It is like [0,0,0,1,1,1,1,2,2,...].");
AddOutput(
"FusedParamOffsets",
"The numel offset of each parameter inside the FP32FusedParam. Its "
"shape is [param_num + 1]. It is like [0, n_0, n_0 + n_1, n_0 + n_1 "
"+ n_2, ...].");
AddOutput("FP32ShardFusedParamOffsets",
"The sharded numel offset of each parameter in the local rank. "
"Its shape is [fp32_local_param_num + 1].");
AddOutput("FP16ShardFusedParamOffsets",
"+ n_2, ...]. It should be in CPUPlace.");
AddOutput(
"FP32ShardFusedParamOffsets",
"The sharded numel offset of each parameter in the local rank. "
"Its shape is [fp16_local_param_num + 1].");
"Its shape is [fp32_local_param_num + 1]. It should be in CPUPlace.");
AddOutput(
"WeightDecay",
"The sharded fp32 weight decay tensor. Its shape is [(M1+M2)/N].");
"FP16ShardFusedParamOffsets",
"The sharded numel offset of each parameter in the local rank. "
"Its shape is [fp16_local_param_num + 1]. It should be in CPUPlace.");
AddOutput("ParamInfo",
"The param info. It should be in CPUPlace, and its shape is [6]"
"CPUPlace, and its shape is [6]. It is "
"CPUPlace, and its shape is [8]. It is "
"[fp32_shard_param_start_idx, fp32_local_param_num, "
"fp32_global_param_num, fp16_shard_param_start_idx, "
"fp16_local_param_num, fp16_global_param_num].");
"fp32_global_param_num, fp32_weight_decay_end_idx, "
"fp16_shard_param_start_idx, "
"fp16_local_param_num, fp16_global_param_num, "
"fp16_weight_decay_end_idx].");
AddOutput("ParamOrder",
"The reordered parameter order. Inside this op, "
"the parameter would be reordered by data type and weight decay "
"value.");
AddOutput("ParamOut", "The output parameter list.").AsDuplicable();
AddOutput("MasterParamOut",
"The output master parameter list. It would share the memory of "
......@@ -96,10 +97,8 @@ class DistributedFusedLambInitOpMaker
AddAttr<float>("beta1", "The initial value of Beta1Pow.");
AddAttr<float>("beta2", "The initial value of Beta2Pow.");
AddAttr<std::vector<float>>(
"weight_decay",
"The weight decay for each parameter. Its "
"shape is equal to the global parameter number.");
AddAttr<std::vector<int>>("apply_weight_decay",
"Whether to apply weight decay.");
AddAttr<int>("alignment", "The alignment in bytes for the fused tensors.");
AddAttr<int>("rank", "The global rank of the current process.");
AddAttr<int>("nranks", "The global world size.");
......
......@@ -258,32 +258,6 @@ static void ShareBufferForNonInitedTensor(framework::Tensor *origin,
<< ") , dtype = " << fused_out->dtype();
}
template <typename OffsetT, typename IndexT>
static __global__ void LambFillFusedIndicesCUDAKernel(const OffsetT *offsets,
IndexT *out,
int offset_num,
int out_num) {
CUDA_KERNEL_LOOP_TYPE(i, out_num, int) {
auto idx = phi::funcs::LowerBound(offsets, offset_num, i);
if (idx == offset_num || offsets[idx] != i) {
--idx;
}
out[i] = idx;
}
}
template <typename T>
static void CopyVectorToTensor(const std::vector<T> &src,
framework::Tensor *dst,
const platform::Place &place,
gpuStream_t stream) {
dst->Resize({static_cast<int64_t>(src.size())});
T *dst_ptr = dst->mutable_data<T>(place);
const T *src_ptr = src.data();
auto nbytes = src.size() * sizeof(T);
memory::Copy(place, dst_ptr, platform::CPUPlace(), src_ptr, nbytes, stream);
}
template <typename T>
static void CopyVectorToCPUTensor(const std::vector<T> &src,
framework::Tensor *dst) {
......@@ -294,6 +268,42 @@ static void CopyVectorToCPUTensor(const std::vector<T> &src,
std::memcpy(dst_ptr, src_ptr, nbytes);
}
static size_t ReorderParamGradInfoList(const std::vector<int> &flags,
std::vector<ParamGradInfo> *infos) {
size_t n = infos->size();
std::vector<int> cur_flags;
cur_flags.reserve(n);
for (size_t i = 0; i < n; ++i) {
auto idx = (*infos)[i].idx;
cur_flags.push_back(flags[idx]);
}
auto origin_infos = *infos;
size_t j = 0;
for (size_t i = 0; i < n; ++i) {
if (cur_flags[i]) {
(*infos)[j] = origin_infos[i];
++j;
}
}
size_t ret_idx = j;
for (size_t i = 0; i < n; ++i) {
if (!cur_flags[i]) {
(*infos)[j] = origin_infos[i];
++j;
}
}
return ret_idx;
}
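ReorderParamGradInfoList is a stable partition: infos whose apply_weight_decay flag is set are moved to the front with relative order preserved, and the returned value is the boundary index (later clipped into ParamInfo as the weight-decay end index). An equivalent sketch using the standard library; name hypothetical, needs <algorithm>:

// Equivalent behaviour using std::stable_partition.
static size_t ReorderByFlagRef(const std::vector<int> &flags,
                               std::vector<ParamGradInfo> *infos) {
  auto it = std::stable_partition(
      infos->begin(), infos->end(),
      [&flags](const ParamGradInfo &info) { return flags[info.idx] != 0; });
  return static_cast<size_t>(it - infos->begin());
}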
template <typename T>
static T ClipByBound(T x, T low_value, T high_value) {
if (x < low_value) return low_value;
if (x > high_value) return high_value;
return x;
}
template <typename T>
class DistributedFusedLambInitOpKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
......@@ -404,6 +414,24 @@ class DistributedFusedLambInitOpKernel<platform::CUDADeviceContext, T>
info->numel_offset = 0; // not determined yet
}
}
const auto &apply_weight_decay =
ctx.Attr<std::vector<int>>("apply_weight_decay");
size_t fp32_wd_end_idx =
ReorderParamGradInfoList(apply_weight_decay, &fp32_infos);
size_t fp16_wd_end_idx =
ReorderParamGradInfoList(apply_weight_decay, &fp16_infos);
auto *param_order_t = ctx.Output<framework::Tensor>("ParamOrder");
auto param_num = fp32_infos.size() + fp16_infos.size();
param_order_t->Resize({static_cast<int16_t>(param_num)});
auto *param_order = param_order_t->mutable_data<int>(platform::CPUPlace());
for (size_t i = 0; i < fp32_infos.size(); ++i) {
param_order[i] = static_cast<int>(fp32_infos[i].idx);
}
for (size_t i = 0; i < fp16_infos.size(); ++i) {
param_order[i + fp32_infos.size()] = static_cast<int>(fp16_infos[i].idx);
}
VLOG(10) << "Fill ParamGradInfo ends";
// Step 2: determine the numel_with_padding and numel_offset
......@@ -568,45 +596,29 @@ class DistributedFusedLambInitOpKernel<platform::CUDADeviceContext, T>
VLOG(10) << "Found the sharding arguments";
auto *param_info_t = ctx.Output<framework::Tensor>("ParamInfo");
param_info_t->Resize({6});
param_info_t->Resize({8});
auto *param_info = param_info_t->mutable_data<int>(platform::CPUPlace());
param_info[0] = static_cast<int>(fp32_start_idx);
param_info[1] = static_cast<int>(fp32_local_param_num);
param_info[2] = static_cast<int>(fp32_infos.size());
param_info[3] = static_cast<int>(fp16_start_idx + fp32_infos.size());
param_info[4] = static_cast<int>(fp16_local_param_num);
param_info[5] = static_cast<int>(fp16_infos.size());
param_info[3] = ClipByBound<int>(fp32_wd_end_idx, fp32_start_idx,
fp32_start_idx + fp32_local_param_num) -
static_cast<int>(fp32_start_idx);
param_info[4] = static_cast<int>(fp16_start_idx + fp32_infos.size());
param_info[5] = static_cast<int>(fp16_local_param_num);
param_info[6] = static_cast<int>(fp16_infos.size());
param_info[7] = ClipByBound<int>(fp16_wd_end_idx, fp16_start_idx,
fp16_start_idx + fp16_local_param_num) -
static_cast<int>(fp16_start_idx);
VLOG(10) << "Start FP32 idx: " << param_info[0];
VLOG(10) << "Local FP32 param num: " << param_info[1];
VLOG(10) << "Global FP32 param num: " << param_info[2];
VLOG(10) << "Start FP16 idx: " << param_info[3];
VLOG(10) << "Local FP16 param num: " << param_info[4];
VLOG(10) << "Global FP16 param num: " << param_info[5];
VLOG(10) << "Start FP16 idx: " << param_info[4];
VLOG(10) << "Local FP16 param num: " << param_info[5];
VLOG(10) << "Global FP16 param num: " << param_info[6];
// For WeightDecay, shard and perform H2D copy
const auto &origin_weight_decay =
ctx.Attr<std::vector<float>>("weight_decay");
PADDLE_ENFORCE_EQ(params.size(), origin_weight_decay.size(),
platform::errors::InvalidArgument(
"The attr(weight_decay) should have the "
"same length with Input(Param)."));
std::vector<float> shard_weight_decay;
shard_weight_decay.reserve(total_local_param_num);
for (size_t i = 0; i < fp32_local_param_num; ++i) {
shard_weight_decay.push_back(
origin_weight_decay[fp32_infos[i + fp32_start_idx].idx]);
}
for (size_t i = 0; i < fp16_local_param_num; ++i) {
shard_weight_decay.push_back(
origin_weight_decay[fp16_infos[i + fp16_start_idx].idx]);
}
// For FusedIndices, launch CUDA kernel to do binary search
auto *fused_indices_t = ctx.Output<framework::Tensor>("FusedIndices");
fused_indices_t->Resize({static_cast<int64_t>(total_numel)});
auto *fused_indices = fused_indices_t->mutable_data<int>(place);
std::vector<int> numel_offsets;
numel_offsets.reserve(params.size() + 1);
for (const auto &info : fp32_infos) {
......@@ -621,21 +633,6 @@ class DistributedFusedLambInitOpKernel<platform::CUDADeviceContext, T>
"The numel_offsets number must be one larger than "
"the parameter number."));
VLOG(10) << "Total numel offset: " << FlattenToString(numel_offsets);
auto *fused_param_offset_t =
ctx.Output<framework::Tensor>("FusedParamOffsets");
fused_param_offset_t->Resize({static_cast<int64_t>(numel_offsets.size())});
auto *fused_param_offset = fused_param_offset_t->mutable_data<int>(place);
memory::Copy(place, fused_param_offset, platform::CPUPlace(),
numel_offsets.data(),
numel_offsets.size() * sizeof(numel_offsets[0]), stream);
auto config = platform::GetGpuLaunchConfig1D(dev_ctx, total_numel);
LambFillFusedIndicesCUDAKernel<<<config.block_per_grid,
config.thread_per_block, 0, stream>>>(
fused_param_offset, fused_indices, numel_offsets.size() - 1,
total_numel);
std::vector<int> lengths;
lengths.reserve(fp32_local_param_num + fp16_local_param_num);
std::vector<int> fp32_partial_numel_offsets;
fp32_partial_numel_offsets.reserve(fp32_local_param_num + 1);
......@@ -659,9 +656,9 @@ class DistributedFusedLambInitOpKernel<platform::CUDADeviceContext, T>
VLOG(10) << "FP32 Partial numel = ["
<< valid_start_n + fp32_infos[i].numel << ","
<< end_n + fp32_infos[i].numel;
lengths.push_back(end_n - valid_start_n);
auto len = end_n - valid_start_n;
fp32_partial_numel_offsets.push_back(fp32_partial_numel_offsets.back() +
lengths.back());
len);
}
std::vector<int> fp16_partial_numel_offsets;
......@@ -682,9 +679,9 @@ class DistributedFusedLambInitOpKernel<platform::CUDADeviceContext, T>
PADDLE_ENFORCE_NE(valid_start_n, end_n,
platform::errors::InvalidArgument(
"Indices sharding error. This may be a bug."));
lengths.push_back(end_n - valid_start_n);
auto len = end_n - valid_start_n;
fp16_partial_numel_offsets.push_back(fp16_partial_numel_offsets.back() +
lengths.back());
len);
}
CopyVectorToCPUTensor(numel_offsets,
......@@ -696,23 +693,6 @@ class DistributedFusedLambInitOpKernel<platform::CUDADeviceContext, T>
fp16_partial_numel_offsets,
ctx.Output<framework::Tensor>("FP16ShardFusedParamOffsets"));
// Fill the weight decay tensor
PADDLE_ENFORCE_EQ(lengths.size(), shard_weight_decay.size(),
platform::errors::InvalidArgument(
"Invalid weight decay sharding. This may be a bug."));
std::vector<float> wd_cpu;
for (size_t i = 0; i < shard_weight_decay.size(); ++i) {
int len = lengths[i];
for (int j = 0; j < len; ++j) {
wd_cpu.push_back(shard_weight_decay[i]);
}
}
PADDLE_ENFORCE_EQ(wd_cpu.size() * nranks, fp32_numel + fp16_numel,
platform::errors::InvalidArgument(
"Invalid weight decay sharding. This may be a bug."));
CopyVectorToTensor(wd_cpu, ctx.Output<framework::Tensor>("WeightDecay"),
place, stream);
auto *global_scale = ctx.Output<framework::Tensor>("GlobalScale");
if (!global_scale->IsInitialized()) {
TensorFillConstant<float>(dev_ctx, global_scale, {1}, 1.0f);
......
......@@ -66,28 +66,31 @@ class DistributedFusedLambOpMaker : public framework::OpProtoAndCheckerMaker {
"The fp32 beta1 power accumulator tensor. Its shape is [1].");
AddInput("Beta2Pow",
"The fp32 beta2 power accumulator tensor. Its shape is [1].");
AddInput("FusedIndices",
"The param index of each element in FP32FusedParam. Its shape is "
"[M1+M2]. It is like [0,0,0,1,1,1,1,2,2,...].");
AddInput(
"FusedParamOffsets",
"The numel offset of each parameter inside the FP32FusedParam. Its "
"shape is [param_num + 1]. It is like [0, n_0, n_0 + n_1, n_0 + n_1 "
"+ n_2, ...].");
AddInput("FP32ShardFusedParamOffsets",
"+ n_2, ...]. It should be in CPUPlace.");
AddInput(
"FP32ShardFusedParamOffsets",
"The sharded numel offset of each parameter in the local rank. "
"Its shape is [fp32_local_param_num + 1].");
AddInput("FP16ShardFusedParamOffsets",
"Its shape is [fp32_local_param_num + 1]. It should be in CPUPlace.");
AddInput(
"FP16ShardFusedParamOffsets",
"The sharded numel offset of each parameter in the local rank. "
"Its shape is [fp16_local_param_num + 1].");
AddInput("WeightDecay",
"The sharded fp32 weight decay tensor. Its shape is [(M1+M2)/N].");
"Its shape is [fp16_local_param_num + 1]. It should be in CPUPlace.");
AddInput("ParamInfo",
"The param info. It should be in CPUPlace, and its shape is [6]"
"CPUPlace, and its shape is [6]. It is "
"CPUPlace, and its shape is [8]. It is "
"[fp32_shard_param_start_idx, fp32_local_param_num, "
"fp32_global_param_num, fp16_shard_param_start_idx, "
"fp16_local_param_num, fp16_global_param_num].");
"fp32_global_param_num, fp32_weight_decay_end_idx, "
"fp16_shard_param_start_idx, "
"fp16_local_param_num, fp16_global_param_num, "
"fp16_weight_decay_end_idx].");
AddInput("ParamOrder",
"The reordered parameter order. Inside this op, "
"the parameter would be reordered by data type and weight decay "
"value.");
AddInput("LearningRate",
"The fp32 learning rate tensor. Its shape is [1].");
......@@ -116,6 +119,7 @@ class DistributedFusedLambOpMaker : public framework::OpProtoAndCheckerMaker {
"max_global_grad_norm",
"The maximum global gradient l2-norm value for clipping. If "
"max_global_grad_norm <= 0, no clipping would be performed.");
AddAttr<float>("weight_decay", "The weight decay value.");
AddAttr<bool>("clip_after_allreduce",
"Whether to clip before allreduce, only valid when the "
"world size is larger than 1.");
......
......@@ -87,7 +87,7 @@ struct L2NormFunctor {
}
};
template <typename InT, typename OutT, int BlockDim, bool NeedSqrt>
template <typename InT, typename OutT, int BlockDim>
static __global__ void MultiTensorL2NormReduceAgainCUDAKernel(
const InT *x, OutT *y, int max_chunk_num) {
int tensor_id = blockIdx.x;
......@@ -100,12 +100,8 @@ static __global__ void MultiTensorL2NormReduceAgainCUDAKernel(
}
sum = BlockReduce(storage).Reduce(sum, cub::Sum());
if (threadIdx.x == 0) {
if (NeedSqrt) {
y[blockIdx.x] = static_cast<OutT>(sqrtf(sum));
} else {
y[blockIdx.x] = static_cast<OutT>(sum);
}
}
}
template <typename T>
......@@ -118,6 +114,7 @@ static int GetChunkedVecSize(const T *ptr, int chunk_size) {
constexpr int vec8 = alignof(platform::AlignedVector<T, 8>);
constexpr int vec4 = alignof(platform::AlignedVector<T, 4>);
constexpr int vec2 = alignof(platform::AlignedVector<T, 2>);
chunk_size *= sizeof(T);
if (address % vec8 == 0 && chunk_size % vec8 == 0) {
return std::min(8, valid_vec_size);
} else if (address % vec4 == 0 && chunk_size % vec4 == 0) {
......@@ -129,27 +126,26 @@ static int GetChunkedVecSize(const T *ptr, int chunk_size) {
}
}
#define PD_VEC_MULTI_TENSOR_APPLY_CASE(__vec_size, ...) \
#define PD_VEC_LAUNCH_KERNEL_CASE(__vec_size, ...) \
case __vec_size: { \
constexpr int kVecSize = __vec_size; \
__VA_ARGS__; \
break; \
}
#define PD_VEC_MULTI_TENSOR_APPLY(__vec_size, ...) \
#define PD_VEC_LAUNCH_KERNEL(__vec_size, ...) \
do { \
switch (__vec_size) { \
PD_VEC_MULTI_TENSOR_APPLY_CASE(8, __VA_ARGS__); \
PD_VEC_MULTI_TENSOR_APPLY_CASE(4, __VA_ARGS__); \
PD_VEC_MULTI_TENSOR_APPLY_CASE(2, __VA_ARGS__); \
PD_VEC_MULTI_TENSOR_APPLY_CASE(1, __VA_ARGS__); \
PD_VEC_LAUNCH_KERNEL_CASE(8, __VA_ARGS__); \
PD_VEC_LAUNCH_KERNEL_CASE(4, __VA_ARGS__); \
PD_VEC_LAUNCH_KERNEL_CASE(2, __VA_ARGS__); \
PD_VEC_LAUNCH_KERNEL_CASE(1, __VA_ARGS__); \
} \
} while (0)
// TODO(zengjinle): which chunk_size is better?
template <typename InT, typename OutT, bool NeedSqrt = false,
int MaxTensorNumPerLaunch = 50, int MaxChunkNumPerLaunch = 680,
int BlockDim = 512>
template <typename InT, typename OutT, int MaxTensorNumPerLaunch = 160,
int MaxChunkNumPerLaunch = 780>
static void MultiTensorL2Norm(const platform::CUDAPlace &place,
gpuStream_t stream, const InT *x,
const int *offsets, int n, OutT *y,
......@@ -158,7 +154,7 @@ static void MultiTensorL2Norm(const platform::CUDAPlace &place,
constexpr int kNumTensor = MaxTensorNumPerLaunch;
constexpr int kNumChunk = MaxChunkNumPerLaunch;
constexpr int kBlockDim = BlockDim;
constexpr int kBlockDim = 512;
int max_chunk_num = -1;
int vec_size = 8;
......@@ -181,22 +177,22 @@ static void MultiTensorL2Norm(const platform::CUDAPlace &place,
auto *tmp_out_ptr = tmp_out.Alloc<MT>(n * max_chunk_num);
FillZeroWithPtr(tmp_out_ptr, n * max_chunk_num, stream);
#define PD_LAUNCH_MULTI_TENSOR_APPLY_KERNEL \
#define PD_LAUNCH_MULTI_TENSOR_APPLY_L2_NORM_KERNEL \
do { \
using FunctorT = L2NormFunctor<InT, kBlockDim, kVecSize>; \
VLOG(10) << __func__ << " " << typeid(InT).name() \
<< " VecSize = " << kVecSize; \
MultiTensorApply<FunctorT, kBlockDim, kNumTensor, kNumChunk>( \
FunctorT(), stream, offsets, n, chunk_size, x, tmp_out_ptr, \
MultiTensorApply<FunctorT, kNumTensor, kNumChunk>( \
FunctorT(), stream, offsets, n, chunk_size, kBlockDim, x, tmp_out_ptr, \
max_chunk_num); \
} while (0)
PD_VEC_MULTI_TENSOR_APPLY(vec_size, PD_LAUNCH_MULTI_TENSOR_APPLY_KERNEL);
#undef PD_LAUNCH_MULTI_TENSOR_APPLY_KERNEL
PD_VEC_LAUNCH_KERNEL(vec_size, PD_LAUNCH_MULTI_TENSOR_APPLY_L2_NORM_KERNEL);
#undef PD_LAUNCH_MULTI_TENSOR_APPLY_L2_NORM_KERNEL
MultiTensorL2NormReduceAgainCUDAKernel<MT, OutT, kBlockDim,
NeedSqrt><<<n, kBlockDim, 0, stream>>>(
tmp_out_ptr, y, max_chunk_num);
MultiTensorL2NormReduceAgainCUDAKernel<
MT, OutT, kBlockDim><<<n, kBlockDim, 0, stream>>>(tmp_out_ptr, y,
max_chunk_num);
}
template <int LogLevel>
......@@ -208,34 +204,17 @@ static void LogParamAndTrustRatioDivSquareNorm(
auto tensors = ctx.MultiInput<framework::Tensor>("Param");
if (tensors.empty()) return;
const auto *order = ctx.Input<framework::Tensor>("ParamOrder")->data<int>();
size_t n = tensors.size();
auto place = tensors[0]->place();
auto pn_vec = ToVector(param_square_norm, n, place);
auto tn_vec = ToVector(trust_ratio_div_square_norm, n, place);
std::vector<size_t> fp32_indices, fp16_indices;
fp32_indices.reserve(n);
fp16_indices.reserve(n);
for (size_t i = 0; i < n; ++i) {
const auto *t = tensors[i];
if (t->dtype() == phi::DataType::FLOAT32) {
fp32_indices.push_back(i);
} else if (t->dtype() == phi::DataType::FLOAT16) {
fp16_indices.push_back(i);
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Unsupported data type %s.", t->dtype()));
}
}
for (auto idx : fp16_indices) {
fp32_indices.push_back(idx);
}
const auto &names = ctx.GetOp().Inputs("Param");
for (size_t i = 0; i < fp32_indices.size(); ++i) {
auto idx = fp32_indices[i];
for (size_t i = 0; i < n; ++i) {
auto idx = order[i];
VLOG(LogLevel) << "Param " << tensors[idx]->dtype() << " " << names[idx]
<< " pn = " << pn_vec[i] << " , tn = " << tn_vec[i];
}
......@@ -353,7 +332,7 @@ static __global__ void CalcGradNormClipBeforeAllReduceScale(
const T1 *__restrict__ global_scale, T1 max_global_grad_norm,
const T1 *__restrict__ square_grad_norm, T1 *__restrict__ out1,
T2 *__restrict__ out2, T1 clip_rescale_grad) {
T1 grad_norm = static_cast<T1>(sqrt(*square_grad_norm)) * clip_rescale_grad;
T1 grad_norm = static_cast<T1>(sqrtf(*square_grad_norm)) * clip_rescale_grad;
T1 scale = global_scale[0] * max_global_grad_norm / (1e-6 + grad_norm);
bool found_nan_inf = !isfinite(scale);
if (scale >= 1 || found_nan_inf) {
......@@ -380,19 +359,24 @@ static __global__ void SetNanInfValueCUDAKernelTwoFlag(const bool *in_flag_p_1,
((*in_flag_p_1) || (*in_flag_p_2)) ? __int_as_float(0x7fffffffU) : 0.0f;
}
// TODO(zengjinle): Vectorize this function
// NOTE: this method does not update Beta1Pow and Beta2Pow!
template <typename T, typename GradT, typename IndexT>
static __global__ void UpdateLambMoment(
template <typename T, typename GradT, int VecSize>
static __global__ void UpdateLambMomentAndTrustRatioDivCUDAKernel(
const T *__restrict__ param_p, const GradT *__restrict__ grad_p,
const T *__restrict__ square_grad_norm_p,
const T *__restrict__ global_scale, const IndexT *__restrict__ indices,
const T *__restrict__ weight_decay_p, const T *__restrict__ beta1pow_p,
const T *__restrict__ global_scale, const T *__restrict__ beta1pow_p,
const T *__restrict__ beta2pow_p, T *__restrict__ mom1_p,
T *__restrict__ mom2_p, T *__restrict__ trust_ratio_div_p, T beta1, T beta2,
T epsilon, T max_global_grad_norm, int num, T rescale_grad) {
T *__restrict__ mom2_p, T *__restrict__ trust_ratio_div_p, bool *found_inf,
T weight_decay, int weight_decay_end_numel, T beta1, T beta2, T epsilon,
T max_global_grad_norm, int num, T rescale_grad) {
T square_grad_norm = *square_grad_norm_p;
if (!isfinite(square_grad_norm)) return;
bool need_update_found_inf =
(found_inf && threadIdx.x == 0 && blockIdx.x == 0);
if (!isfinite(square_grad_norm)) {
if (need_update_found_inf) *found_inf = true;
return;
} else if (need_update_found_inf) {
*found_inf = false;
}
T scale = rescale_grad / global_scale[0];
if (max_global_grad_norm > 0) {
......@@ -406,27 +390,112 @@ static __global__ void UpdateLambMoment(
T one_minus_beta1pow = 1 - beta1pow_p[0];
T one_minus_beta2pow = 1 - beta2pow_p[0];
CUDA_KERNEL_LOOP(i, num) {
T p = param_p[i];
T g = static_cast<T>(grad_p[i]) * scale;
T weight_decay = weight_decay_p[i];
T mom1 = mom1_p[i];
T mom2 = mom2_p[i];
int i = (threadIdx.x + blockIdx.x * blockDim.x) * VecSize;
int stride = blockDim.x * gridDim.x * VecSize;
mom1 = beta1 * mom1 + (1 - beta1) * g;
mom2 = beta2 * mom2 + (1 - beta2) * g * g;
for (; i + VecSize <= num; i += stride) {
platform::AlignedVector<T, VecSize> param_vec;
platform::AlignedVector<GradT, VecSize> grad_vec;
platform::AlignedVector<T, VecSize> weight_decay_vec;
platform::AlignedVector<T, VecSize> mom1_vec;
platform::AlignedVector<T, VecSize> mom2_vec;
platform::AlignedVector<T, VecSize> trust_ratio_div_vec;
T mom1_unbiased = mom1 / one_minus_beta1pow;
T mom2_unbiased = mom2 / one_minus_beta2pow;
T trust_ratio_div =
mom1_unbiased / (sqrtf(mom2_unbiased) + epsilon) + weight_decay * p;
T cur_weight_decay = (i < weight_decay_end_numel) * weight_decay;
if (cur_weight_decay != static_cast<T>(0.0)) {
platform::Load(param_p + i, &param_vec);
} else {
#pragma unroll
for (int j = 0; j < VecSize; ++j) {
param_vec[j] = static_cast<T>(0);
}
}
platform::Load(grad_p + i, &grad_vec);
platform::Load(mom1_p + i, &mom1_vec);
platform::Load(mom2_p + i, &mom2_vec);
#define PD_LAMB_MOM_TRUST_RATIO_DIV_UPDATE(__param, __grad, __mom1, __mom2, \
__trust_ratio_div, __idx) \
T p = __param[__idx]; \
T g = static_cast<T>(__grad[__idx]) * scale; \
T mom1 = __mom1[__idx]; \
T mom2 = __mom2[__idx]; \
mom1 = beta1 * mom1 + (1 - beta1) * g; \
mom2 = beta2 * mom2 + (1 - beta2) * g * g; \
T mom1_unbiased = mom1 / one_minus_beta1pow; \
T mom2_unbiased = mom2 / one_minus_beta2pow; \
__trust_ratio_div[__idx] = \
mom1_unbiased / (sqrtf(mom2_unbiased) + epsilon) + cur_weight_decay * p; \
__mom1[__idx] = mom1; \
__mom2[__idx] = mom2;
mom1_p[i] = mom1;
mom2_p[i] = mom2;
trust_ratio_div_p[i] = trust_ratio_div;
#pragma unroll
for (int j = 0; j < VecSize; ++j) {
PD_LAMB_MOM_TRUST_RATIO_DIV_UPDATE(param_vec, grad_vec, mom1_vec,
mom2_vec, trust_ratio_div_vec, j);
}
platform::Store(mom1_vec, mom1_p + i);
platform::Store(mom2_vec, mom2_p + i);
platform::Store(trust_ratio_div_vec, trust_ratio_div_p + i);
}
for (; i < num; ++i) {
T cur_weight_decay = (i < weight_decay_end_numel) * weight_decay;
PD_LAMB_MOM_TRUST_RATIO_DIV_UPDATE(param_p, grad_p, mom1_p, mom2_p,
trust_ratio_div_p, i);
}
}
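Per element, the kernel above performs the standard LAMB/Adam moment update and stores the trust-ratio dividend; the vectorized and tail paths both expand the same macro. A scalar reference of that arithmetic; name hypothetical, needs <cmath>, and here g is assumed to be the already-scaled gradient and weight_decay the per-element cur_weight_decay from above:

// Scalar reference of PD_LAMB_MOM_TRUST_RATIO_DIV_UPDATE.
template <typename T>
void LambMomentRef(T p, T g, T *mom1, T *mom2, T *trust_ratio_div, T beta1,
                   T beta2, T epsilon, T one_minus_beta1pow,
                   T one_minus_beta2pow, T weight_decay) {
  T m1 = beta1 * (*mom1) + (1 - beta1) * g;      // first moment
  T m2 = beta2 * (*mom2) + (1 - beta2) * g * g;  // second moment
  T m1_unbiased = m1 / one_minus_beta1pow;       // bias correction
  T m2_unbiased = m2 / one_minus_beta2pow;
  *trust_ratio_div =
      m1_unbiased / (std::sqrt(m2_unbiased) + epsilon) + weight_decay * p;
  *mom1 = m1;
  *mom2 = m2;
}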
template <typename T, typename GradT>
static void MultiTensorUpdateLambMomentAndTrustRatioDiv(
const platform::CUDADeviceContext &dev_ctx, const int *offsets, int n,
const T *param_p, const GradT *grad_p, const T *square_grad_norm_p,
const T *global_scale, const T *beta1pow_p, const T *beta2pow_p, T *mom1_p,
T *mom2_p, T *trust_ratio_div_p, bool *found_inf_p, T weight_decay,
int weight_decay_end_idx, T beta1, T beta2, T epsilon,
T max_global_grad_norm, T rescale_grad) {
if (n <= 0) return;
int numel = offsets[n] - offsets[0];
PADDLE_ENFORCE_GE(weight_decay_end_idx, 0,
platform::errors::InvalidArgument(
"The weight decay end index should be >= 0."));
PADDLE_ENFORCE_LE(weight_decay_end_idx, n,
platform::errors::InvalidArgument(
"The weight decay end index should be < %d.", n));
auto weight_decay_end_numel = offsets[weight_decay_end_idx] - offsets[0];
int vec_size = GetChunkedVecSize(param_p, 0);
vec_size = std::min(vec_size, GetChunkedVecSize(grad_p, 0));
vec_size = std::min(vec_size, GetChunkedVecSize(mom1_p, 0));
vec_size = std::min(vec_size, GetChunkedVecSize(mom2_p, 0));
vec_size = std::min(vec_size, GetChunkedVecSize(trust_ratio_div_p, 0));
for (int i = 0; i < n; ++i) {
auto length = offsets[i + 1] - offsets[i];
while (length % vec_size != 0) {
vec_size /= 2;
}
}
VLOG(1) << __func__ << " VecSize = " << vec_size;
auto stream = dev_ctx.stream();
auto config = platform::GetGpuLaunchConfig1D(dev_ctx, numel, vec_size);
#define PD_LAUNCH_LAMB_MOM_TRUST_RATIO_DIV_KERNEL \
do { \
UpdateLambMomentAndTrustRatioDivCUDAKernel<T, GradT, kVecSize><<< \
config.block_per_grid, config.thread_per_block, 0, stream>>>( \
param_p, grad_p, square_grad_norm_p, global_scale, beta1pow_p, \
beta2pow_p, mom1_p, mom2_p, trust_ratio_div_p, found_inf_p, \
weight_decay, weight_decay_end_numel, beta1, beta2, epsilon, \
max_global_grad_norm, numel, rescale_grad); \
} while (0)
PD_VEC_LAUNCH_KERNEL(vec_size, PD_LAUNCH_LAMB_MOM_TRUST_RATIO_DIV_KERNEL);
#undef PD_LAUNCH_LAMB_MOM_TRUST_RATIO_DIV_KERNEL
}
template <typename T, bool NeedUpdate /*=true*/>
struct LambBetaPowUpdateOnceHelper {
LambBetaPowUpdateOnceHelper(T *beta1pow, T *beta2pow, T beta1, T beta2) {
......@@ -468,33 +537,6 @@ struct LambBetaPowUpdateOnceHelper<T, false> {
HOSTDEVICE void UpdateBetaPows() const {}
};
template <bool HasFoundInf /*=true*/>
struct LambFoundInfHelper {
public:
explicit LambFoundInfHelper(bool *found_inf) : found_inf_(found_inf) {
PADDLE_ENFORCE_NOT_NULL(found_inf,
platform::errors::InvalidArgument(
"The found_inf should not be nullptr."));
}
HOSTDEVICE void UpdateFoundInf(bool value) { *found_inf_ = value; }
private:
bool *__restrict__ found_inf_;
};
template <>
struct LambFoundInfHelper<false> {
public:
explicit LambFoundInfHelper(bool *found_inf) {
PADDLE_ENFORCE_EQ(
found_inf, nullptr,
platform::errors::InvalidArgument("The found_inf should be nullptr."));
}
HOSTDEVICE void UpdateFoundInf(bool) {}
};
template <typename T, bool HasMasterParam /*=true*/>
struct LambParamHelper {
LambParamHelper(T *param, MasterT<T> *master_param) {
......@@ -509,12 +551,9 @@ struct LambParamHelper {
master_param_ = master_param;
}
HOSTDEVICE void SetParam(int i, MasterT<T> updated_p) {
param_[i] = static_cast<T>(updated_p);
master_param_[i] = updated_p;
}
HOSTDEVICE T *__restrict__ ParamPtr() { return param_; }
HOSTDEVICE MasterT<T> GetParam(int i) { return master_param_[i]; }
HOSTDEVICE MasterT<T> *__restrict__ MasterParamPtr() { return master_param_; }
private:
T *__restrict__ param_;
......@@ -538,158 +577,169 @@ struct LambParamHelper<T, false> {
param_ = param;
}
HOSTDEVICE void SetParam(int i, MasterT<T> updated_p) {
param_[i] = static_cast<T>(updated_p);
}
HOSTDEVICE T *__restrict__ ParamPtr() { return param_; }
HOSTDEVICE MasterT<T> GetParam(int i) {
return static_cast<MasterT<T>>(param_[i]);
}
HOSTDEVICE constexpr MasterT<T> *MasterParamPtr() { return nullptr; }
private:
T *__restrict__ param_;
};
template <typename ParamT, typename IndexT, bool HasMasterParam,
bool NeedUpdateBetaPow, bool HasFoundInf>
struct LambParamAndBetaPowsUpdateHelper
: public LambParamHelper<ParamT, HasMasterParam>,
public LambBetaPowUpdateOnceHelper<MasterT<ParamT>, NeedUpdateBetaPow>,
public LambFoundInfHelper<HasFoundInf> {
LambParamAndBetaPowsUpdateHelper(
ParamT *param, MasterT<ParamT> *master_param, MasterT<ParamT> *beta1pow,
MasterT<ParamT> *beta2pow, MasterT<ParamT> beta1, MasterT<ParamT> beta2,
bool *found_inf, const MasterT<ParamT> *trust_ratio_div,
const MasterT<ParamT> *lr, const IndexT *index,
template <typename ParamT, bool HasMasterParam, bool NeedUpdateBetaPow,
int VecSize>
struct LambUpdateParamAndBetaPowsFunctor {
DEVICE void operator()(
int tensor_id, int chunk_id, int offset, int size,
LambParamHelper<ParamT, HasMasterParam> param_helper,
const MasterT<ParamT> *trust_ratio_div, const MasterT<ParamT> *lr,
const MasterT<ParamT> *param_square_norm,
const MasterT<ParamT> *trust_ratio_div_square_norm,
const MasterT<ParamT> *update_flag)
: LambParamHelper<ParamT, HasMasterParam>(param, master_param),
LambBetaPowUpdateOnceHelper<MasterT<ParamT>, NeedUpdateBetaPow>(
beta1pow, beta2pow, beta1, beta2),
LambFoundInfHelper<HasFoundInf>(found_inf),
trust_ratio_div(trust_ratio_div),
lr(lr),
index(index),
param_square_norm(param_square_norm),
trust_ratio_div_square_norm(trust_ratio_div_square_norm),
update_flag(update_flag) {}
const MasterT<ParamT> *__restrict__ trust_ratio_div;
const MasterT<ParamT> *__restrict__ lr;
const IndexT *__restrict__ index;
const MasterT<ParamT> *__restrict__ param_square_norm;
const MasterT<ParamT> *__restrict__ trust_ratio_div_square_norm;
const MasterT<ParamT> *__restrict__ update_flag;
};
template <typename ParamT, typename IndexT, bool HasMasterParam,
bool NeedUpdateBetaPow, bool HasFoundInf>
static __global__ void LambUpdateParamAndBetaPowsCUDAKernel(
LambParamAndBetaPowsUpdateHelper<ParamT, IndexT, HasMasterParam,
NeedUpdateBetaPow, HasFoundInf>
args,
int num) {
auto should_update = *args.update_flag;
if (!isfinite(should_update)) {
if (HasFoundInf && threadIdx.x == 0 && blockIdx.x == 0) {
args.UpdateFoundInf(true);
}
return;
} else if (HasFoundInf && threadIdx.x == 0 && blockIdx.x == 0) {
args.UpdateFoundInf(false);
}
if (NeedUpdateBetaPow && threadIdx.x == 0 && blockIdx.x == 0) {
args.UpdateBetaPows();
}
const MasterT<ParamT> *trust_ratio_div_square_norm, const bool *found_inf,
LambBetaPowUpdateOnceHelper<MasterT<ParamT>, NeedUpdateBetaPow>
betapow_helper) const {
if (*found_inf) return;
using MT = MasterT<ParamT>;
MT lr_value = *args.lr;
CUDA_KERNEL_LOOP(i, num) {
MT p = args.GetParam(i);
MT t = args.trust_ratio_div[i];
auto norm_idx = args.index[i];
MT p_square_norm = args.param_square_norm[norm_idx];
MT t_square_norm = args.trust_ratio_div_square_norm[norm_idx];
MT p_norm = static_cast<MT>(sqrtf(p_square_norm));
MT t_norm = static_cast<MT>(sqrtf(t_square_norm));
MT p_square_norm = param_square_norm[tensor_id];
MT t_square_norm = trust_ratio_div_square_norm[tensor_id];
MT lr_value = *lr;
MT ratio = (p_square_norm != static_cast<MT>(0) &&
t_square_norm != static_cast<MT>(0)
? lr_value * sqrtf(p_square_norm / t_square_norm)
: lr_value);
auto update = (p_norm != static_cast<MT>(0) && t_norm != static_cast<MT>(0))
? p_norm / t_norm
: static_cast<MT>(1);
MT updated_p = p - lr_value * update * t;
args.SetParam(i, updated_p);
int i;
int stride = blockDim.x * VecSize;
ParamT *param = param_helper.ParamPtr() + offset;
MT *master_param = HasMasterParam ? param_helper.MasterParamPtr() + offset
: param_helper.MasterParamPtr();
trust_ratio_div += offset;
for (i = threadIdx.x * VecSize; i + VecSize <= size; i += stride) {
platform::AlignedVector<MT, VecSize> trust_ratio_div_vec;
platform::Load(trust_ratio_div + i, &trust_ratio_div_vec);
if (HasMasterParam) {
platform::AlignedVector<MT, VecSize> master_param_vec;
platform::Load(master_param + i, &master_param_vec);
platform::AlignedVector<ParamT, VecSize> param_vec;
#pragma unroll
for (int j = 0; j < VecSize; ++j) {
MT p = master_param_vec[j] - ratio * trust_ratio_div_vec[j];
master_param_vec[j] = p;
param_vec[j] = static_cast<ParamT>(p);
}
}
template <typename ParamT, typename IndexT>
static void LambUpdateParamAndBetaPows(
const platform::CUDADeviceContext &dev_ctx,
const MasterT<ParamT> *trust_ratio_div, const MasterT<ParamT> *lr,
const IndexT *index, const MasterT<ParamT> *param_square_norm,
const MasterT<ParamT> *trust_ratio_div_square_norm,
const MasterT<ParamT> *update_flag, MasterT<ParamT> **beta1pow,
MasterT<ParamT> **beta2pow, bool **found_inf, MasterT<ParamT> beta1,
MasterT<ParamT> beta2, int num, ParamT *param,
MasterT<ParamT> *master_param, gpuStream_t stream) {
if (num == 0) return;
bool has_master_param = !(std::is_same<ParamT, MasterT<ParamT>>::value);
auto has_beta_pow = (*beta1pow) != nullptr && (*beta2pow) != nullptr;
auto has_found_inf = (*found_inf) != nullptr;
#define PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL( \
__has_master_param, __has_beta_pow, __has_found_inf) \
do { \
LambParamAndBetaPowsUpdateHelper<ParamT, IndexT, __has_master_param, \
__has_beta_pow, __has_found_inf> \
helper(param, master_param, *beta1pow, *beta2pow, beta1, beta2, \
*found_inf, trust_ratio_div, lr, index, param_square_norm, \
trust_ratio_div_square_norm, update_flag); \
auto config = platform::GetGpuLaunchConfig1D(dev_ctx, num); \
LambUpdateParamAndBetaPowsCUDAKernel<<< \
config.block_per_grid, config.thread_per_block, 0, stream>>>(helper, \
num); \
} while (0)
if (has_master_param) {
if (has_beta_pow) {
if (has_found_inf) {
PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL(true, true, true);
platform::Store(master_param_vec, master_param + i);
platform::Store(param_vec, param + i);
} else {
PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL(true, true, false);
platform::AlignedVector<ParamT, VecSize> param_vec;
platform::Load(param + i, &param_vec);
#pragma unroll
for (int j = 0; j < VecSize; ++j) {
MT p = static_cast<MT>(param_vec[j]) - ratio * trust_ratio_div_vec[j];
param_vec[j] = static_cast<ParamT>(p);
}
} else {
if (has_found_inf) {
PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL(true, false, true);
} else {
PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL(true, false, false);
platform::Store(param_vec, param + i);
}
}
for (; i < size; ++i) {
if (HasMasterParam) {
MT p = master_param[i] - ratio * trust_ratio_div[i];
master_param[i] = p;
param[i] = static_cast<ParamT>(p);
} else {
if (has_beta_pow) {
if (has_found_inf) {
PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL(false, true, true);
} else {
PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL(false, true, false);
MT p = static_cast<MT>(param[i]) - ratio * trust_ratio_div[i];
param[i] = static_cast<ParamT>(p);
}
}
if (NeedUpdateBetaPow && threadIdx.x == 0 && blockIdx.x == 0) {
betapow_helper.UpdateBetaPows();
}
}
};
// TODO(zengjinle): which block_dim and chunk_size would be better?
template <typename ParamT, int MaxTensorNumPerLaunch = 160,
int MaxChunkNumPerLaunch = 780>
static void MultiTensorUpdateLambParamAndBetaPows(
const platform::CUDADeviceContext &dev_ctx, const int *offsets, int n,
const MasterT<ParamT> *trust_ratio_div, const MasterT<ParamT> *lr,
const MasterT<ParamT> *param_square_norm,
const MasterT<ParamT> *trust_ratio_div_square_norm, const bool *found_inf,
ParamT *param, MasterT<ParamT> *master_param, MasterT<ParamT> *beta1pow,
MasterT<ParamT> *beta2pow, MasterT<ParamT> beta1, MasterT<ParamT> beta2,
int chunk_size = 65536) {
constexpr bool kHasMasterParam =
!(std::is_same<ParamT, MasterT<ParamT>>::value);
bool has_beta_pow = (beta1pow != nullptr);
if (has_beta_pow) {
PADDLE_ENFORCE_NOT_NULL(beta2pow, platform::errors::InvalidArgument(
"Beta2Pow should not be nullptr."));
} else {
if (has_found_inf) {
PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL(false, false, true);
} else {
PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL(false, false, false);
PADDLE_ENFORCE_EQ(beta2pow, nullptr, platform::errors::InvalidArgument(
"Beta2Pow should be nullptr."));
}
const int block_dim = 512;
int vec_size = 8;
for (int i = 0; i < n; ++i) {
int offset = offsets[i] - offsets[0];
vec_size =
std::min(vec_size, GetChunkedVecSize(param + offset, chunk_size));
if (kHasMasterParam) {
vec_size = std::min(vec_size,
GetChunkedVecSize(master_param + offset, chunk_size));
}
vec_size = std::min(
vec_size, GetChunkedVecSize(trust_ratio_div + offset, chunk_size));
}
*beta1pow = nullptr;
*beta2pow = nullptr;
*found_inf = nullptr;
#undef PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL
VLOG(1) << __func__ << " VecSize = " << vec_size;
constexpr auto kNumTensor = MaxTensorNumPerLaunch;
constexpr auto kNumChunk = MaxChunkNumPerLaunch;
auto stream = dev_ctx.stream();
#define PD_LAUNCH_MULTI_TENSOR_UPDATE_PARAM_BETAPOW(__has_beta_pow) \
do { \
using FunctorT = \
LambUpdateParamAndBetaPowsFunctor<ParamT, kHasMasterParam, \
__has_beta_pow, kVecSize>; \
LambParamHelper<ParamT, kHasMasterParam> param_helper(param, \
master_param); \
LambBetaPowUpdateOnceHelper<MasterT<ParamT>, __has_beta_pow> \
betapow_helper(beta1pow, beta2pow, beta1, beta2); \
launcher.Launch(FunctorT(), param_helper, trust_ratio_div, lr, \
param_square_norm, trust_ratio_div_square_norm, found_inf, \
betapow_helper); \
} while (0)
#define PD_LAUNCH_VEC_MULTI_TENSOR_UPDATE_PARAM_BETAPOW_CASE \
do { \
auto callback = [&]( \
const MultiTensorLauncher<kNumTensor, kNumChunk> &launcher, \
int launch_n) { \
if (has_beta_pow && launch_n == 0) { \
PD_LAUNCH_MULTI_TENSOR_UPDATE_PARAM_BETAPOW(true); \
beta1pow = nullptr; \
beta2pow = nullptr; \
} else { \
PD_LAUNCH_MULTI_TENSOR_UPDATE_PARAM_BETAPOW(false); \
} \
}; \
MultiTensorApplyWithCallback<kNumTensor, kNumChunk>( \
stream, offsets, n, chunk_size, block_dim, callback); \
} while (0)
PD_VEC_LAUNCH_KERNEL(vec_size,
PD_LAUNCH_VEC_MULTI_TENSOR_UPDATE_PARAM_BETAPOW_CASE);
#undef PD_LAUNCH_MULTI_TENSOR_UPDATE_PARAM_BETAPOW
#undef PD_LAUNCH_VEC_MULTI_TENSOR_UPDATE_PARAM_BETAPOW_CASE
}
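// Illustrative sketch (assumption, not part of this diff): one plausible way a
// GetChunkedVecSize-style helper could pick the vector width used above. The
// helper name and exact policy here are hypothetical; the idea is that the
// widest load (up to 8 elements) is kept only if both the pointer alignment
// and the chunk size stay compatible with it.
template <typename T>
static int GetChunkedVecSizeSketch(const T *ptr, int chunk_size) {
  int vec_size = 8;
  auto addr = reinterpret_cast<uintptr_t>(ptr);
  // Halve the candidate width until the address is vec_size * sizeof(T)
  // aligned and every chunk boundary (a multiple of chunk_size elements)
  // preserves that alignment.
  while (vec_size > 1 && (addr % (vec_size * sizeof(T)) != 0 ||
                          chunk_size % vec_size != 0)) {
    vec_size /= 2;
  }
  return vec_size;
}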
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
......@@ -1005,15 +1055,16 @@ class DistributedFusedLambOpKernel<platform::CUDADeviceContext, T>
"Too many parameter number. Only <= %d is supported.",
std::numeric_limits<int>::max()));
// Step 3: Get FusedIndices, ParamInfo
const auto *indices = GetInputTensorPtr<int>(ctx, "FusedIndices");
// Step 3: Get ParamInfo
const auto *param_info_tensor = GetInputTensorPtr<int>(ctx, "ParamInfo");
auto fp32_local_start_idx = param_info_tensor[0];
auto fp32_local_param_num = param_info_tensor[1];
auto fp32_global_param_num = param_info_tensor[2];
auto fp16_local_start_idx = param_info_tensor[3];
auto fp16_local_param_num = param_info_tensor[4];
auto fp16_global_param_num = param_info_tensor[5];
auto fp32_weight_decay_end_idx = param_info_tensor[3];
auto fp16_local_start_idx = param_info_tensor[4];
auto fp16_local_param_num = param_info_tensor[5];
auto fp16_global_param_num = param_info_tensor[6];
auto fp16_weight_decay_end_idx = param_info_tensor[7];
auto local_param_num = fp32_local_param_num + fp16_local_param_num;
auto param_num = fp32_global_param_num + fp16_global_param_num;
......@@ -1031,7 +1082,7 @@ class DistributedFusedLambOpKernel<platform::CUDADeviceContext, T>
<< " , fp16_global_param_num = " << fp16_global_param_num;
// Step 4: Get LearningRate, Moment1, Moment2, Beta1Pow, Beta2Pow,
// WeightDecay, GlobalScale, FoundInf
// GlobalScale, FoundInf
const auto *global_scale = GetInputTensorPtr<float>(ctx, "GlobalScale");
const auto *lr = GetInputTensorPtr<float>(ctx, "LearningRate");
int64_t partial_numel = 0;
......@@ -1065,14 +1116,15 @@ class DistributedFusedLambOpKernel<platform::CUDADeviceContext, T>
GetSameInOutTensorPtr<float>(ctx, place, "Beta1Pow", "Beta1PowOut");
auto *beta2pow =
GetSameInOutTensorPtr<float>(ctx, place, "Beta2Pow", "Beta2PowOut");
const float *weight_decay = GetInputTensorPtr<float>(ctx, "WeightDecay");
auto *found_inf_t = ctx.Output<framework::Tensor>("FoundInf");
found_inf_t->Resize({1});
auto *found_inf = found_inf_t->mutable_data<bool>(place);
// Step 5: Get attributes beta1, beta2, epsilon, max_grad_norm, ring_id,
// Step 5: Get attributes weight_decay, beta1, beta2, epsilon,
// max_grad_norm, ring_id,
// use_master_param_norm, is_grad_scaled_by_nranks
auto weight_decay = ctx.Attr<float>("weight_decay");
auto beta1 = ctx.Attr<float>("beta1");
auto beta2 = ctx.Attr<float>("beta2");
auto epsilon = ctx.Attr<float>("epsilon");
......@@ -1105,7 +1157,8 @@ class DistributedFusedLambOpKernel<platform::CUDADeviceContext, T>
platform::float16 *fp16_sum_grad;
auto fp32_numel_each_device = fp32_numel / num_devices;
auto fp16_numel_each_device = fp16_numel / num_devices;
if (num_devices > 1) {
if (num_devices > 1 ||
(max_global_grad_norm > 0 && !clip_after_allreduce)) {
auto ptr = sum_grad_buffer.Alloc<uint8_t>(
fp32_numel_each_device * sizeof(float) +
fp16_numel_each_device * sizeof(platform::float16));
......@@ -1181,7 +1234,11 @@ class DistributedFusedLambOpKernel<platform::CUDADeviceContext, T>
float, platform::float16><<<1, 1, 0, stream>>>(
global_scale, max_global_grad_norm, fp32_square_grad_norm,
fp32_scale, fp16_scale, clip_scale);
if (fp32_scale) {
VLOG(1) << "Grad scale: " << FlattenToString(fp32_scale, 1, place);
} else {
VLOG(1) << "Grad scale: " << FlattenToString(fp16_scale, 1, place);
}
if (num_devices > 1) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(
fp32_square_grad_norm, fp32_square_grad_norm, 1, ncclFloat32,
......@@ -1218,36 +1275,56 @@ class DistributedFusedLambOpKernel<platform::CUDADeviceContext, T>
VLOG(10) << "ReduceScatter done";
// Step 7: update the moment1, moment2. Calculate the trust_ratio_div
auto *fused_offsets_t = ctx.Input<framework::Tensor>("FusedParamOffsets");
auto *fused_offsets = fused_offsets_t->data<int>();
auto *fp32_partial_fused_offsets_t =
ctx.Input<framework::Tensor>("FP32ShardFusedParamOffsets");
const auto *fp32_partial_fused_offsets =
fp32_partial_fused_offsets_t->data<int>();
auto *fp16_partial_fused_offsets_t =
ctx.Input<framework::Tensor>("FP16ShardFusedParamOffsets");
const auto *fp16_partial_fused_offsets =
fp16_partial_fused_offsets_t->data<int>();
VLOG(1) << "FusedParamOffsets: "
<< FlattenToString(fused_offsets, fused_offsets_t->numel(),
fused_offsets_t->place());
VLOG(1) << "FP32ShardFusedParamOffsets: "
<< FlattenToString(fp32_partial_fused_offsets,
fp32_partial_fused_offsets_t->numel(),
fp32_partial_fused_offsets_t->place());
VLOG(1) << "FP16ShardFusedParamOffsets: "
<< FlattenToString(fp16_partial_fused_offsets,
fp16_partial_fused_offsets_t->numel(),
fp16_partial_fused_offsets_t->place());
memory::Buffer trust_ratio_div_buffer(place);
auto *trust_ratio_div = trust_ratio_div_buffer.Alloc<float>(partial_numel);
auto fp32_offset = rank * fp32_numel_each_device;
auto fp16_offset = rank * fp16_numel_each_device;
if (has_fp32_param) {
auto config =
platform::GetGpuLaunchConfig1D(dev_ctx, fp32_numel_each_device);
VLOG(10) << "Update FP32 Moment and TrustRatioDiv starts";
UpdateLambMoment<<<config.block_per_grid, config.thread_per_block, 0,
stream>>>(
MultiTensorUpdateLambMomentAndTrustRatioDiv(
dev_ctx, fp32_partial_fused_offsets, fp32_local_param_num,
fp32_param + fp32_offset, fp32_sum_grad, fp32_square_grad_norm,
global_scale, indices + fp32_offset, weight_decay, beta1pow, beta2pow,
moment1, moment2, trust_ratio_div, beta1, beta2, epsilon,
max_global_grad_norm, fp32_numel_each_device, rescale_grad);
global_scale, beta1pow, beta2pow, moment1, moment2, trust_ratio_div,
found_inf, weight_decay, fp32_weight_decay_end_idx, beta1, beta2,
epsilon, max_global_grad_norm, rescale_grad);
VLOG(10) << "Update FP32 Moment and TrustRatioDiv done";
}
float *master_param = nullptr;
if (has_fp16_param) {
master_param = fp32_param + fp32_numel;
auto config =
platform::GetGpuLaunchConfig1D(dev_ctx, fp16_numel_each_device);
VLOG(10) << "Update FP16 Moment and TrustRatioDiv starts";
UpdateLambMoment<<<config.block_per_grid, config.thread_per_block, 0,
stream>>>(
auto tmp_found_inf = has_fp32_param ? nullptr : found_inf;
MultiTensorUpdateLambMomentAndTrustRatioDiv(
dev_ctx, fp16_partial_fused_offsets, fp16_local_param_num,
master_param + fp16_offset, fp16_sum_grad, fp32_square_grad_norm,
global_scale, indices + fp32_numel + fp16_offset, weight_decay,
beta1pow, beta2pow, moment1 + fp32_numel_each_device,
global_scale, beta1pow, beta2pow, moment1 + fp32_numel_each_device,
moment2 + fp32_numel_each_device,
trust_ratio_div + fp32_numel_each_device, beta1, beta2, epsilon,
max_global_grad_norm, fp16_numel_each_device, rescale_grad);
trust_ratio_div + fp32_numel_each_device, tmp_found_inf, weight_decay,
fp16_weight_decay_end_idx, beta1, beta2, epsilon,
max_global_grad_norm, rescale_grad);
VLOG(10) << "Update FP16 Moment and TrustRatioDiv done";
}
......@@ -1257,30 +1334,6 @@ class DistributedFusedLambOpKernel<platform::CUDADeviceContext, T>
memory::Buffer square_norm_buffer(place);
auto *param_square_norm = square_norm_buffer.Alloc<float>(2 * param_num);
auto *trust_ratio_div_square_norm = param_square_norm + param_num;
auto *fused_offsets_t = ctx.Input<framework::Tensor>("FusedParamOffsets");
auto *fused_offsets = fused_offsets_t->data<int>();
auto *fp32_partial_fused_offsets_t =
ctx.Input<framework::Tensor>("FP32ShardFusedParamOffsets");
const auto *fp32_partial_fused_offsets =
fp32_partial_fused_offsets_t->data<int>();
auto *fp16_partial_fused_offsets_t =
ctx.Input<framework::Tensor>("FP16ShardFusedParamOffsets");
const auto *fp16_partial_fused_offsets =
fp16_partial_fused_offsets_t->data<int>();
VLOG(1) << "FusedParamOffsets: "
<< FlattenToString(fused_offsets, fused_offsets_t->numel(),
fused_offsets_t->place());
VLOG(1) << "FP32ShardFusedParamOffsets: "
<< FlattenToString(fp32_partial_fused_offsets,
fp32_partial_fused_offsets_t->numel(),
fp32_partial_fused_offsets_t->place());
VLOG(1) << "FP16ShardFusedParamOffsets: "
<< FlattenToString(fp16_partial_fused_offsets,
fp16_partial_fused_offsets_t->numel(),
fp16_partial_fused_offsets_t->place());
if (num_devices > 1) {
if (use_master_param_norm) {
FillZeroWithPtr(param_square_norm + fp32_global_param_num,
......@@ -1296,11 +1349,11 @@ class DistributedFusedLambOpKernel<platform::CUDADeviceContext, T>
fp16_partial_fused_offsets, fp16_local_param_num,
param_square_norm + fp16_local_start_idx);
} else {
// NOTE: extra computation is performed. We can improve this performance
// if needed in the future.
MultiTensorL2Norm(
place, stream, fp16_param, fused_offsets + fp32_global_param_num,
fp16_global_param_num, param_square_norm + fp32_global_param_num);
place, stream, fp16_param + fused_offsets[fp16_local_start_idx] -
fused_offsets[fp32_global_param_num],
fused_offsets + fp16_local_start_idx, fp16_local_param_num,
param_square_norm + fp16_local_start_idx);
}
MultiTensorL2Norm(place, stream, trust_ratio_div,
......@@ -1333,26 +1386,29 @@ class DistributedFusedLambOpKernel<platform::CUDADeviceContext, T>
// Step 9: update parameter, beta1pow, beta2pow. All gather parameters.
if (has_fp32_param) {
LambUpdateParamAndBetaPows<float>(
dev_ctx, trust_ratio_div, lr, indices + fp32_offset,
param_square_norm, trust_ratio_div_square_norm, fp32_square_grad_norm,
&beta1pow, &beta2pow, &found_inf, beta1, beta2,
fp32_numel_each_device, fp32_param + fp32_offset, nullptr, stream);
MultiTensorUpdateLambParamAndBetaPows<float>(
dev_ctx, fp32_partial_fused_offsets, fp32_local_param_num,
trust_ratio_div, lr, param_square_norm + fp32_local_start_idx,
trust_ratio_div_square_norm + fp32_local_start_idx, found_inf,
fp32_param + fp32_offset, nullptr, beta1pow, beta2pow, beta1, beta2);
if (num_devices > 1) {
// ncclAllGather
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather(
fp32_param + fp32_offset, fp32_param, fp32_numel_each_device,
ncclFloat32, comm, stream));
}
beta1pow = nullptr;
beta2pow = nullptr;
}
if (has_fp16_param) {
LambUpdateParamAndBetaPows<platform::float16>(
dev_ctx, trust_ratio_div + fp32_numel_each_device, lr,
indices + fp32_numel + fp16_offset, param_square_norm,
trust_ratio_div_square_norm, fp32_square_grad_norm, &beta1pow,
&beta2pow, &found_inf, beta1, beta2, fp16_numel_each_device,
fp16_param + fp16_offset, master_param + fp16_offset, stream);
MultiTensorUpdateLambParamAndBetaPows<platform::float16>(
dev_ctx, fp16_partial_fused_offsets, fp16_local_param_num,
trust_ratio_div + fp32_numel_each_device, lr,
param_square_norm + fp16_local_start_idx,
trust_ratio_div_square_norm + fp16_local_start_idx, found_inf,
fp16_param + fp16_offset, master_param + fp16_offset, beta1pow,
beta2pow, beta1, beta2);
if (num_devices > 1) {
// ncclAllGather
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather(
......
......@@ -94,11 +94,40 @@ static __global__ void MultiTensorApplyCUDAKernel(
args...);
}
template <typename Functor, int BlockDim, int MaxTensorNumPerLaunch,
int MaxChunkNumPerLaunch, typename... Args>
static void MultiTensorApply(Functor functor, gpuStream_t stream,
const int *offsets, int n, int chunk_size,
Args... args) {
template <int MaxTensorNumPerLaunch, int MaxChunkNumPerLaunch>
class MultiTensorLauncher {
public:
MultiTensorLauncher(
const TensorMetaList<MaxTensorNumPerLaunch, MaxChunkNumPerLaunch> &meta,
const int &chunk_id, const int &chunk_size, const int &block_dim,
const gpuStream_t &stream)
: meta_(meta),
chunk_id_(chunk_id),
chunk_size_(chunk_size),
block_dim_(block_dim),
stream_(stream) {}
template <typename Functor, typename... Args>
void Launch(Functor &&functor, Args &&... args) const {
MultiTensorApplyCUDAKernel<
Functor, MaxTensorNumPerLaunch,
MaxChunkNumPerLaunch><<<chunk_id_, block_dim_, 0, stream_>>>(
functor, meta_, chunk_size_, args...);
}
private:
const TensorMetaList<MaxTensorNumPerLaunch, MaxChunkNumPerLaunch> &meta_;
const int &chunk_id_;
const int &chunk_size_;
const int &block_dim_;
const gpuStream_t &stream_;
};
template <int MaxTensorNumPerLaunch, int MaxChunkNumPerLaunch,
typename Callback>
static void MultiTensorApplyWithCallback(gpuStream_t stream, const int *offsets,
int n, int chunk_size, int block_dim,
Callback &&callback) {
if (n == 0) return;
constexpr auto NumTensor = MaxTensorNumPerLaunch;
......@@ -110,6 +139,11 @@ static void MultiTensorApply(Functor functor, gpuStream_t stream,
int numel_offset = 0;
metas.start_tensor_id = 0;
metas.start_chunk_id = 0;
int launch_num = 0;
MultiTensorLauncher<MaxTensorNumPerLaunch, MaxChunkNumPerLaunch> launcher(
metas, chunk_id, chunk_size, block_dim, stream);
for (int i = 0; i < n; ++i) {
auto length = offsets[i + 1] - offsets[i];
if (tensor_id == 0) {
......@@ -132,9 +166,8 @@ static void MultiTensorApply(Functor functor, gpuStream_t stream,
bool last_chunk = (i + 1 == n && j + 1 == chunk_num);
if (tensor_full || block_full || last_chunk) {
MultiTensorApplyCUDAKernel<Functor, NumTensor,
NumChunk><<<chunk_id, BlockDim, 0, stream>>>(
functor, metas, chunk_size, args...);
callback(launcher, launch_num);
++launch_num;
chunk_id = 0;
if (j + 1 == chunk_num) { // chunk for the current tensor is full
metas.start_chunk_id = 0;
......@@ -152,5 +185,17 @@ static void MultiTensorApply(Functor functor, gpuStream_t stream,
}
}
template <typename Functor, int MaxTensorNumPerLaunch, int MaxChunkNumPerLaunch,
typename... Args>
static void MultiTensorApply(Functor functor, gpuStream_t stream,
const int *offsets, int n, int chunk_size,
int block_dim, Args &&... args) {
auto callback = [&](const MultiTensorLauncher<MaxTensorNumPerLaunch,
MaxChunkNumPerLaunch> &launcher,
int i) { launcher.Launch(functor, args...); };
MultiTensorApplyWithCallback<MaxTensorNumPerLaunch, MaxChunkNumPerLaunch>(
stream, offsets, n, chunk_size, block_dim, callback);
}
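// Illustrative sketch (assumption, not part of this diff): the callback form
// above lets a caller vary behaviour per launch, e.g. run a one-time functor
// only on the first launch and a cheaper functor afterwards, which is the
// pattern the LAMB update uses to update beta_pow exactly once. The functor
// types and the helper name below are hypothetical.
template <typename OneTimeFunctor, typename RepeatFunctor>
static void MultiTensorApplyOnceThenRepeat(gpuStream_t stream,
                                           const int *offsets, int n,
                                           int chunk_size, int block_dim,
                                           OneTimeFunctor first_functor,
                                           RepeatFunctor rest_functor) {
  auto callback = [&](const MultiTensorLauncher<160, 780> &launcher,
                      int launch_id) {
    if (launch_id == 0) {
      launcher.Launch(first_functor);  // one-time work, e.g. beta_pow update
    } else {
      launcher.Launch(rest_functor);   // remaining launches skip that work
    }
  };
  MultiTensorApplyWithCallback<160, 780>(stream, offsets, n, chunk_size,
                                         block_dim, callback);
}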
} // namespace operators
} // namespace paddle
......@@ -39,6 +39,11 @@ __device__ __forceinline__ int sgn(T val) {
__device__ __forceinline__ platform::float16 inline_abs(platform::float16 x) {
return static_cast<platform::float16>(abs(static_cast<float>(x)));
}
__device__ __forceinline__ platform::bfloat16 inline_abs(platform::bfloat16 x) {
return static_cast<platform::bfloat16>(abs(static_cast<float>(x)));
}
__device__ __forceinline__ float inline_abs(float x) { return abs(x); }
__device__ __forceinline__ double inline_abs(double x) { return abs(x); }
......@@ -53,6 +58,11 @@ __device__ __forceinline__ platform::float16 inline_pow(
return static_cast<platform::float16>(
pow(static_cast<float>(base), static_cast<float>(exponent)));
}
__device__ __forceinline__ platform::bfloat16 inline_pow(
platform::bfloat16 base, platform::bfloat16 exponent) {
return static_cast<platform::bfloat16>(
pow(static_cast<float>(base), static_cast<float>(exponent)));
}
__device__ __forceinline__ float inline_pow(float base, float exponent) {
return pow(base, exponent);
}
......@@ -202,9 +212,11 @@ using CUDA = paddle::platform::CUDADeviceContext;
REGISTER_OP_CUDA_KERNEL(p_norm,
ops::PnormCUDAKernel<CUDA, paddle::platform::float16>,
ops::PnormCUDAKernel<CUDA, paddle::platform::bfloat16>,
ops::PnormCUDAKernel<CUDA, float>,
ops::PnormCUDAKernel<CUDA, double>);
REGISTER_OP_CUDA_KERNEL(
p_norm_grad, ops::PnormGradCUDAKernel<CUDA, paddle::platform::float16>,
ops::PnormGradCUDAKernel<CUDA, paddle::platform::bfloat16>,
ops::PnormGradCUDAKernel<CUDA, float>,
ops::PnormGradCUDAKernel<CUDA, double>);
......@@ -23,6 +23,7 @@ REGISTER_OP_CUDA_KERNEL(
reduce_sum_grad, CUDAReduceSumGradKernel<bool>,
CUDAReduceSumGradKernel<float>, CUDAReduceSumGradKernel<double>,
CUDAReduceSumGradKernel<paddle::platform::float16>,
CUDAReduceSumGradKernel<paddle::platform::bfloat16>,
CUDAReduceSumGradKernel<int>, CUDAReduceSumGradKernel<int64_t>,
CUDAReduceSumGradKernel<paddle::platform::complex<float>>,
CUDAReduceSumGradKernel<paddle::platform::complex<double>>);
......@@ -15,6 +15,9 @@ limitations under the License. */
#include "paddle/fluid/operators/split_op.h"
#include <string>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/phi/infermeta/unary.h"
namespace paddle {
namespace operators {
using framework::Tensor;
......@@ -23,52 +26,6 @@ class SplitOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
platform::errors::InvalidArgument(
"Input(X) of SplitOp should not be null."));
PADDLE_ENFORCE_GE(ctx->Outputs("Out").size(), 1UL,
platform::errors::InvalidArgument(
"Outputs(Out) of SplitOp should not be empty."));
auto in_dims = ctx->GetInputDim("X");
auto outs_names = ctx->Outputs("Out");
size_t axis = static_cast<size_t>(ctx->Attrs().Get<int>("axis"));
size_t num = static_cast<size_t>(ctx->Attrs().Get<int>("num"));
std::vector<int> sections = static_cast<std::vector<int>>(
ctx->Attrs().Get<std::vector<int>>("sections"));
const size_t outs_number = outs_names.size();
if (sections.size() > 0) {
PADDLE_ENFORCE_EQ(
sections.size(), outs_number,
platform::errors::InvalidArgument("tensor split sections size "
"should be equal to output size."));
}
if (ctx->HasInput("AxisTensor")) {
auto out_dims = phi::make_ddim(std::vector<int>(in_dims.size(), -1));
std::vector<framework::DDim> outs_dims(outs_number, out_dims);
ctx->SetOutputsDim("Out", outs_dims);
for (size_t i = 0; i < outs_number; ++i) {
ctx->ShareLoD("X", "Out", 0, i);
}
return;
}
bool each_section_is_known =
(sections.size() > 0 && !ctx->HasInputs("SectionsTensorList"));
auto outs_dims = UpdateOutsDims(ctx->IsRuntime(), each_section_is_known,
in_dims, num, sections, axis, outs_number);
ctx->SetOutputsDim("Out", outs_dims);
if (axis != 0) {
// Only pass LoD when not splitting along the first dim.
for (size_t i = 0; i < outs_number; ++i) {
ctx->ShareLoD("X", "Out", 0, i);
}
}
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
......@@ -168,6 +125,10 @@ Example:
namespace ops = paddle::operators;
DELCARE_INFER_SHAPE_FUNCTOR(split, SplitInferShapeFunctor,
PT_INFER_META(phi::SplitInferMeta));
REGISTER_OPERATOR(split, ops::SplitOp, ops::SplitOpMaker,
ops::SplitGradMaker<paddle::framework::OpDesc>,
ops::SplitGradMaker<paddle::imperative::OpBase>);
ops::SplitGradMaker<paddle::imperative::OpBase>,
SplitInferShapeFunctor);
......@@ -258,4 +258,5 @@ REGISTER_OP_CUDA_KERNEL(
ops::SumKernel<paddle::platform::CUDADeviceContext, double>,
ops::SumKernel<paddle::platform::CUDADeviceContext, int>,
ops::SumKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::SumKernel<paddle::platform::CUDADeviceContext, plat::float16>);
ops::SumKernel<paddle::platform::CUDADeviceContext, plat::float16>,
ops::SumKernel<paddle::platform::CUDADeviceContext, plat::bfloat16>);
......@@ -281,10 +281,6 @@ REGISTER_OPERATOR(
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
paddle::operators::UniformRandomOpVarTypeInference);
REGISTER_OP_CPU_KERNEL(
uniform_random, paddle::operators::CPUUniformRandomKernel<float>,
paddle::operators::CPUUniformRandomKernel<double>,
paddle::operators::CPUUniformRandomKernel<paddle::platform::bfloat16>);
REGISTER_OP_CPU_KERNEL(
uniform_random_batch_size_like,
paddle::operators::CPUUniformRandomKernel<float>,
......
......@@ -58,9 +58,6 @@ class GPUUniformRandomKernel : public framework::OpKernel<T> {
} // namespace operators
} // namespace paddle
REGISTER_OP_CUDA_KERNEL(uniform_random,
paddle::operators::GPUUniformRandomKernel<float>,
paddle::operators::GPUUniformRandomKernel<double>);
REGISTER_OP_CUDA_KERNEL(uniform_random_batch_size_like,
paddle::operators::GPUUniformRandomKernel<float>,
paddle::operators::GPUUniformRandomKernel<double>);
......@@ -12,8 +12,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/where_op.h"
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/multiary.h"
namespace paddle {
namespace operators {
......@@ -21,31 +23,6 @@ class WhereOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("Condition"), "Input", "Condition", "Where");
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Where");
OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "Where");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Where");
auto cond_dims = ctx->GetInputDim("Condition");
auto x_dims = ctx->GetInputDim("X");
auto y_dims = ctx->GetInputDim("Y");
PADDLE_ENFORCE_EQ(
cond_dims, x_dims,
platform::errors::InvalidArgument(
"The dims of Inputs(Condition) and Inputs(X) should be same. "
"But received Condition's shape is [%s], X's shape is [%s]",
cond_dims, x_dims));
PADDLE_ENFORCE_EQ(x_dims, y_dims,
platform::errors::InvalidArgument(
"The dims of Inputs(X) and Inputs(Y) should be same. "
"But received X's shape is [%s], Y's shape is [%s]",
x_dims, y_dims));
ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
ctx->ShareLoD("X", /*->*/ "Out");
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
......@@ -140,19 +117,12 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(WhereGradNoNeedBufferVarsInferer, "X", "Y");
} // namespace paddle
namespace ops = paddle::operators;
DELCARE_INFER_SHAPE_FUNCTOR(where, WhereInferShapeFunctor,
PT_INFER_META(phi::WhereInferMeta));
REGISTER_OPERATOR(where, ops::WhereOp, ops::WhereOpMaker,
ops::WhereOpGradMaker<paddle::framework::OpDesc>,
ops::WhereOpGradMaker<paddle::imperative::OpBase>);
ops::WhereOpGradMaker<paddle::imperative::OpBase>,
WhereInferShapeFunctor);
REGISTER_OPERATOR(where_grad, ops::WhereGradOp,
ops::WhereGradNoNeedBufferVarsInferer);
REGISTER_OP_CPU_KERNEL(
where, ops::WhereKernel<paddle::platform::CPUDeviceContext, float>,
ops::WhereKernel<paddle::platform::CPUDeviceContext, double>,
ops::WhereKernel<paddle::platform::CPUDeviceContext, int>,
ops::WhereKernel<paddle::platform::CPUDeviceContext, int64_t>);
REGISTER_OP_CPU_KERNEL(
where_grad, ops::WhereGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::WhereGradKernel<paddle::platform::CPUDeviceContext, double>,
ops::WhereGradKernel<paddle::platform::CPUDeviceContext, int>,
ops::WhereGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h"
#include "paddle/fluid/operators/where_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
namespace platform = paddle::platform;
namespace paddle {
namespace operators {
template <typename T>
struct CondFunctor {
HOSTDEVICE inline CondFunctor() {}
HOSTDEVICE inline T operator()(const bool cond, const T x, const T y) const {
return cond ? x : y;
}
};
template <typename T>
__global__ void WhereCUDAKernel(const int N, const bool* cond, const T* x,
const T* y, T* out) {
int idx = blockDim.x * blockIdx.x + threadIdx.x;
for (; idx < N; idx += blockDim.x * gridDim.x) {
out[idx] = cond[idx] ? x[idx] : y[idx];
}
}
template <typename T>
__global__ void WhereGradCUDAKernel(const int N, const T* dout,
const bool* cond, T* dx, T* dy) {
int idx = blockDim.x * blockIdx.x + threadIdx.x;
for (; idx < N; idx += blockDim.x * gridDim.x) {
if (dx != nullptr) {
dx[idx] = cond[idx] ? dout[idx] : 0.;
}
if (dy != nullptr) {
dy[idx] = cond[idx] ? 0. : dout[idx];
}
}
}
template <typename T>
class WhereKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* condition = context.Input<framework::Tensor>("Condition");
auto* X = context.Input<framework::Tensor>("X");
auto* Y = context.Input<framework::Tensor>("Y");
auto* out = context.Output<framework::Tensor>("Out");
auto numel = condition->numel();
// TODO(GaaoWei8): Input of where can be broadcast
const bool* cond_data = condition->data<bool>();
const T* x_data = X->data<T>();
const T* y_data = Y->data<T>();
T* out_data = out->mutable_data<T>(context.GetPlace());
auto stream = context.cuda_device_context().stream();
auto& dev_ctx =
context.template device_context<platform::CUDADeviceContext>();
auto functor = CondFunctor<T>();
std::vector<const framework::Tensor*> ins = {condition, X, Y};
std::vector<framework::Tensor*> outs = {out};
paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
&outs, functor);
}
};
template <typename T>
class WhereGradKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* condition = context.Input<framework::Tensor>("Condition");
const bool* cond_data = condition->data<bool>();
auto numel = condition->numel();
auto* dout_t =
context.Input<framework::Tensor>(framework::GradVarName("Out"));
auto* dx_t = context.Output<framework::Tensor>(framework::GradVarName("X"));
auto* dy_t = context.Output<framework::Tensor>(framework::GradVarName("Y"));
auto* dout = dout_t->data<T>();
T* dx =
(dx_t != nullptr) ? dx_t->mutable_data<T>(context.GetPlace()) : nullptr;
T* dy =
(dy_t != nullptr) ? dy_t->mutable_data<T>(context.GetPlace()) : nullptr;
auto stream = context.cuda_device_context().stream();
auto& dev_ctx =
context.template device_context<platform::CUDADeviceContext>();
auto config = GetGpuLaunchConfig1D(dev_ctx, condition->numel());
WhereGradCUDAKernel<
T><<<config.block_per_grid.x, config.thread_per_block.x, 0, stream>>>(
numel, dout, cond_data, dx, dy);
}
};
} // namespace operators
} // namespace paddle
REGISTER_OP_CUDA_KERNEL(
where, paddle::operators::WhereKernel<platform::CUDADeviceContext, float>,
paddle::operators::WhereKernel<platform::CUDADeviceContext, double>,
paddle::operators::WhereKernel<platform::CUDADeviceContext, int>,
paddle::operators::WhereKernel<platform::CUDADeviceContext, int64_t>);
REGISTER_OP_CUDA_KERNEL(
where_grad,
paddle::operators::WhereGradKernel<platform::CUDADeviceContext, float>,
paddle::operators::WhereGradKernel<platform::CUDADeviceContext, double>,
paddle::operators::WhereGradKernel<platform::CUDADeviceContext, int>,
paddle::operators::WhereGradKernel<platform::CUDADeviceContext, int64_t>);
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class WhereKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* condition = context.Input<framework::Tensor>("Condition");
auto* X = context.Input<framework::Tensor>("X");
auto* Y = context.Input<framework::Tensor>("Y");
auto* out = context.Output<framework::Tensor>("Out");
const bool* cond_data = condition->data<bool>();
const T* x_data = X->data<T>();
const T* y_data = Y->data<T>();
T* out_data = out->mutable_data<T>(context.GetPlace());
auto x_numel = X->numel();
for (int i = 0; i < x_numel; i++) {
out_data[i] = cond_data[i] ? x_data[i] : y_data[i];
}
}
};
template <typename DeviceContext, typename T>
class WhereGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* condition = context.Input<framework::LoDTensor>("Condition");
const auto* cond_data = condition->data<bool>();
auto numel = condition->numel();
auto* dout_t =
context.Input<framework::Tensor>(framework::GradVarName("Out"));
auto* dx_t = context.Output<framework::Tensor>(framework::GradVarName("X"));
auto* dy_t = context.Output<framework::Tensor>(framework::GradVarName("Y"));
auto* dout = dout_t->data<T>();
if (dx_t != nullptr) {
auto* dx = dx_t->mutable_data<T>(context.GetPlace());
for (int i = 0; i < numel; i++) {
dx[i] = dout[i] * (cond_data[i] ? 1. : 0.);
}
}
if (dy_t != nullptr) {
auto* dy = dy_t->mutable_data<T>(context.GetPlace());
for (int i = 0; i < numel; i++) {
dy[i] = dout[i] * (cond_data[i] ? 0. : 1.);
}
}
}
};
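// Worked example (illustrative, not part of this diff) for the two kernels
// above: with cond = {1, 0, 1}, x = {10, 20, 30}, y = {1, 2, 3} the forward
// pass gives out = {10, 2, 30}; with dout = {1, 1, 1} the backward pass gives
// dx = {1, 0, 1} and dy = {0, 1, 0}.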
} // namespace operators
} // namespace paddle
......@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/where_op.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
......
......@@ -14,7 +14,7 @@
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/operators/where_op.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
......
......@@ -20,6 +20,7 @@ limitations under the License. */
#include <hip/hip_runtime.h>
#endif
#include <stdio.h>
#include "paddle/fluid/platform/bfloat16.h"
#include "paddle/fluid/platform/complex.h"
#include "paddle/fluid/platform/float16.h"
......@@ -244,6 +245,72 @@ __device__ __forceinline__ void VectorizedAtomicAddPerBlock(
#endif
#endif
// NOTE(zhangbo): CUDA does not have atomicCAS for __nv_bfloat16.
inline static __device__ uint32_t bf16_add_to_low_half(uint32_t val, float x) {
bfloat16 low_half;
// the bfloat16 stored in the lower 16 bits
low_half.x = static_cast<uint16_t>(val & 0xFFFFu);
low_half = static_cast<bfloat16>(static_cast<float>(low_half) + x);
return (val & 0xFFFF0000u) | low_half.x;
}
inline static __device__ uint32_t bf16_add_to_high_half(uint32_t val, float x) {
bfloat16 high_half;
// the bfloat16 stored in the higher 16 bits
high_half.x = static_cast<uint16_t>(val >> 16);
high_half = static_cast<bfloat16>(static_cast<float>(high_half) + x);
return (val & 0xFFFFu) | (static_cast<uint32_t>(high_half.x) << 16);
}
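// Worked example (illustrative, not part of this diff): for an aligned 32-bit
// word holding 0xAAAABBBB, the bfloat16 stored in the lower half has bits
// 0xBBBB and the one in the higher half has bits 0xAAAA. bf16_add_to_low_half
// rewrites only the 0xBBBB part and bf16_add_to_high_half only the 0xAAAA
// part, so the neighbouring value packed in the same word is left untouched.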
#if CUDA_VERSION >= 11000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
static __device__ __forceinline__ bfloat16 CUDABF16ToPDBF16(__nv_bfloat16 x) {
return *reinterpret_cast<bfloat16 *>(&x);
}
static __device__ __forceinline__ __nv_bfloat16 PDBF16ToCUDABF16(bfloat16 x) {
return *reinterpret_cast<__nv_bfloat16 *>(&x);
}
CUDA_ATOMIC_WRAPPER(Add, bfloat16) {
return CUDABF16ToPDBF16(atomicAdd(reinterpret_cast<__nv_bfloat16 *>(address),
PDBF16ToCUDABF16(val)));
}
#else
CUDA_ATOMIC_WRAPPER(Add, bfloat16) {
// The packed bfloat16 value may exist in either the lower or the higher
// 16 bits of the aligned 32-bit address.
uint32_t *address_as_ui = reinterpret_cast<uint32_t *>(
reinterpret_cast<char *>(address) -
(reinterpret_cast<uintptr_t>(address) & 0x02));
float val_f = static_cast<float>(val);
uint32_t old = *address_as_ui;
uint32_t sum;
uint32_t newval;
uint32_t assumed;
if (((uintptr_t)address & 0x02) == 0) {
// the bfloat16 value stays in the lower 16 bits of the address.
do {
assumed = old;
old = atomicCAS(address_as_ui, assumed,
bf16_add_to_low_half(assumed, val_f));
} while (old != assumed);
bfloat16 ret;
ret.x = old & 0xFFFFu;
return ret;
} else {
// the bfloat16 value stays in the higher 16 bits of the address.
do {
assumed = old;
old = atomicCAS(address_as_ui, assumed,
bf16_add_to_high_half(assumed, val_f));
} while (old != assumed);
bfloat16 ret;
ret.x = old >> 16;
return ret;
}
}
#endif
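// Illustrative usage sketch (assumption, not part of this diff): many threads
// accumulating bfloat16 values into a small buffer through the wrapper above,
// assuming CUDA_ATOMIC_WRAPPER expands to a CudaAtomicAdd overload as it does
// for the other types in this header. On sm80+ with CUDA >= 11 this resolves
// to the native __nv_bfloat16 atomicAdd, otherwise to the atomicCAS emulation.
__global__ void AccumulateBF16Sketch(bfloat16 *dst, const bfloat16 *src, int n,
                                     int num_bins) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < n) {
    // Adjacent bins may share one 32-bit word, which is exactly the case the
    // CAS-based fallback above has to handle.
    CudaAtomicAdd(&dst[idx % num_bins], src[idx]);
  }
}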
CUDA_ATOMIC_WRAPPER(Add, complex<float>) {
float *real = reinterpret_cast<float *>(address);
float *imag = real + 1;
......
......@@ -81,7 +81,7 @@ set(PYBIND_SRCS
cuda_streams_py.cc)
if(NOT ON_INFER)
set (PYBIND_DEPS ${PYBIND_DEPS} processgroup)
set (PYBIND_DEPS ${PYBIND_DEPS} processgroup eager_reducer)
if (WITH_NCCL)
set (PYBIND_DEPS ${PYBIND_DEPS} processgroup_nccl)
endif()
......
......@@ -23,6 +23,7 @@ limitations under the License. */
#include "paddle/fluid/distributed/collective/ProcessGroup.h"
#include "paddle/fluid/distributed/collective/Types.h"
#include "paddle/fluid/distributed/collective/reducer.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/imperative/layer.h"
......@@ -143,6 +144,19 @@ void BindDistributed(py::module *m) {
[](distributed::ProcessGroupStrategy &self, int nrings) {
self.nrings_ = nrings;
});
m->def("eager_assign_group_by_size",
[](py::handle py_tensors, std::vector<bool> is_sparse_gradient,
std::vector<size_t> group_size_limits,
std::vector<int64_t> tensor_indices) {
auto tensors = CastPyArg2VectorOfTensor(py_tensors.ptr(), 0);
return distributed::Eager_AssignGroupBySize(
tensors, is_sparse_gradient, group_size_limits, tensor_indices);
},
py::arg("tensors"), py::arg("is_sparse_gradient"),
py::arg("group_size_limits") = std::vector<size_t>{25 * 1024 * 1024},
py::arg("tensor_indices") = std::vector<int64_t>{},
py::call_guard<py::gil_scoped_release>());
}
} // end namespace pybind
......
......@@ -60,7 +60,8 @@ std::map<std::string, std::set<std::string>> op_ins_map = {
{"momentum", {"Param", "Grad", "Velocity", "LearningRate", "MasterParam"}},
{"merged_momentum",
{"Param", "Grad", "Velocity", "LearningRate", "MasterParam"}},
{"sparse_momentum", {"Param", "Grad", "Velocity", "Index", "LearningRate"}},
{"sparse_momentum",
{"Param", "Grad", "Velocity", "Index", "LearningRate", "MasterParam"}},
{"rnn", {"Input", "PreState", "WeightList", "SequenceLength"}},
{"run_program", {"X", "Params"}},
{"fused_feedforward",
......@@ -124,7 +125,7 @@ std::map<std::string, std::set<std::string>> op_outs_map = {
{"generate_proposals_v2", {"RpnRois", "RpnRoiProbs", "RpnRoisNum"}},
{"momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}},
{"merged_momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}},
{"sparse_momentum", {"ParamOut", "VelocityOut"}},
{"sparse_momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}},
{"rnn", {"DropoutState", "Reserve", "Out", "State"}},
{"run_program", {"DOut"}},
{"adam",
......@@ -181,7 +182,7 @@ std::map<std::string, std::set<std::string>> op_passing_outs_map = {
"out_old_num_accumulates", "out_num_updates"}},
{"momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}},
{"merged_momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}},
{"sparse_momentum", {"ParamOut", "VelocityOut"}},
{"sparse_momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}},
{"batch_norm", {"MeanOut", "VarianceOut"}},
{"sync_batch_norm", {"MeanOut", "VarianceOut"}},
{"accuracy", {"Correct", "Total"}},
......
......@@ -40,6 +40,12 @@ def DenseTensor : Infrt_Type<"DenseTensor"> {
);
}
// Type constraint for a concrete DenseTensor type.
class DenseTensor<string target, string precision, string layout> :
Type<CPred<"$_self == ::infrt::DenseTensorType::get($_self.getContext(), ::infrt::TargetType::"#target#",::infrt::PrecisionType::"#precision#",::infrt::LayoutType::"#layout#")">,
"!infrt.DenseTensor<"#target#","#precision#","#layout#">",
"::infrt::DenseTensorType">;
// Base class for infrt dialect attributes.
class Infrt_Attr<string name, list<Trait> traits = [],
string baseCppClass = "::mlir::Attribute">
......
......@@ -21,8 +21,8 @@
#include "paddle/infrt/dialect/infrt/infrt_dialect.h"
#include "paddle/infrt/dialect/infrt_base.h"
#include "paddle/infrt/dialect/pd_ops.h"
#include "paddle/infrt/dialect/phi/infrt_phi_tensor.h"
#include "paddle/infrt/dialect/phi/phi_base.h"
#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h"
#include "paddle/infrt/dialect/phi/ir/phi_base.h"
#include "paddle/infrt/dialect/tensor_shape.h"
namespace infrt {
......
......@@ -2,16 +2,7 @@ if (NOT INFRT_WITH_PHI)
return()
endif()
#mlir_tablegen_on(infrt_phi_base DIALECT phi)
add_mlir_dialect(infrt_phi_base phi)
add_mlir_dialect(infrt_phi_tensor phi_dt)
add_mlir_dialect(infrt_phi_kernel phi_kernel)
#mlir_tablegen_on(infrt_phi_tensor)
gather_srcs(infrt_src SRCS
phi_base.cc infrt_phi_tensor.cc
infrt_phi_tensor.cc)
add_subdirectory(ir)
add_subdirectory(pass)
add_executable(phi-exec phi_exec.cc)
......
#mlir_tablegen_on(infrt_phi_base DIALECT phi)
add_mlir_dialect(infrt_phi_base phi)
add_mlir_dialect(infrt_phi_tensor phi_dt)
add_mlir_dialect(infrt_phi_kernel phi_kernel)
#mlir_tablegen_on(infrt_phi_tensor)
gather_srcs(infrt_src SRCS
phi_base.cc
infrt_phi_tensor.cc)
......@@ -4,7 +4,7 @@
include "mlir/Interfaces/SideEffectInterfaces.td"
include "mlir/IR/OpBase.td"
include "paddle/infrt/dialect/infrt_base.td"
include "paddle/infrt/dialect/phi/infrt_phi_base.td"
include "paddle/infrt/dialect/phi/ir/infrt_phi_base.td"
def PHI_KernelDialect : Dialect {
let name = "phi_kernel";
......
......@@ -12,12 +12,12 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/infrt/dialect/phi/infrt_phi_tensor.h"
#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h"
#include <mlir/IR/BuiltinTypes.h>
#include "paddle/infrt/dialect/phi/infrt_phi_tensorDialect.cpp.inc"
#include "paddle/infrt/dialect/phi/infrt_phi_tensorTypes.cpp.inc"
#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensorDialect.cpp.inc"
#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensorTypes.cpp.inc"
namespace infrt {
namespace phi {
......@@ -25,7 +25,7 @@ namespace phi {
void PHIDenseTensorDialect::initialize() {
#define GET_OP_LIST
addOperations<
#include "paddle/infrt/dialect/phi/infrt_phi_tensor.cpp.inc"
#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.cpp.inc"
>();
}
......@@ -33,4 +33,4 @@ void PHIDenseTensorDialect::initialize() {
} // namespace infrt
#define GET_OP_CLASSES
#include "paddle/infrt/dialect/phi/infrt_phi_tensor.cpp.inc" // NOLINT
#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.cpp.inc" // NOLINT
......@@ -29,11 +29,11 @@
#include <mlir/Interfaces/LoopLikeInterface.h>
#include <mlir/Interfaces/SideEffectInterfaces.h>
#include "paddle/infrt/dialect/phi/infrt_phi_tensorDialect.h.inc"
#include "paddle/infrt/dialect/phi/infrt_phi_tensorTypes.h.inc"
#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensorDialect.h.inc"
#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensorTypes.h.inc"
#include "paddle/infrt/dialect/dense_tensor.h"
#include "paddle/infrt/dialect/phi/phi_base.h"
#include "paddle/infrt/dialect/phi/ir/phi_base.h"
// NOLINT
#define GET_OP_CLASSES
#include "paddle/infrt/dialect/phi/infrt_phi_tensor.h.inc"
#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h.inc"
......@@ -2,7 +2,7 @@
#else
#define PHI_TENSOR
include "paddle/infrt/dialect/phi/infrt_phi_base.td"
include "paddle/infrt/dialect/phi/ir/infrt_phi_base.td"
include "mlir/Interfaces/SideEffectInterfaces.td"
include "mlir/IR/OpBase.td"
include "paddle/infrt/dialect/infrt_base.td"
......
......@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/infrt/dialect/phi/phi_base.h"
#include "paddle/infrt/dialect/phi/ir/phi_base.h"
#include <mlir/IR/Builders.h>
#include <mlir/IR/Dialect.h>
......@@ -21,8 +21,8 @@
#include <mlir/IR/TypeUtilities.h>
#include <mlir/IR/Types.h>
#include "paddle/infrt/common/global.h"
#include "paddle/infrt/dialect/phi/infrt_phi_base.cpp.inc"
#include "paddle/infrt/dialect/phi/infrt_phi_baseDialect.cpp.inc"
#include "paddle/infrt/dialect/phi/ir/infrt_phi_base.cpp.inc"
#include "paddle/infrt/dialect/phi/ir/infrt_phi_baseDialect.cpp.inc"
namespace infrt {
namespace phi {
......@@ -51,11 +51,11 @@ void PHIDialect::printType(::mlir::Type type,
void PHIDialect::initialize() {
addOperations<
#define GET_OP_LIST
#include "paddle/infrt/dialect/phi/infrt_phi_base.cpp.inc" // NOLINT
#include "paddle/infrt/dialect/phi/ir/infrt_phi_base.cpp.inc" // NOLINT
>();
addTypes<
#define GET_TYPEDEF_LIST
#include "paddle/infrt/dialect/phi/infrt_phi_baseTypes.cpp.inc" // NOLINT
#include "paddle/infrt/dialect/phi/ir/infrt_phi_baseTypes.cpp.inc" // NOLINT
>();
}
......@@ -81,4 +81,4 @@ mlir::Type PHIDialect::parseType(mlir::DialectAsmParser& parser) const {
} // namespace infrt
#define GET_TYPEDEF_CLASSES
#include "paddle/infrt/dialect/phi/infrt_phi_baseTypes.cpp.inc" // NOLINT
#include "paddle/infrt/dialect/phi/ir/infrt_phi_baseTypes.cpp.inc" // NOLINT
......@@ -19,11 +19,13 @@
#include <string>
#include "paddle/infrt/dialect/phi/infrt_phi_base.h.inc"
#include "paddle/infrt/dialect/phi/infrt_phi_baseDialect.h.inc"
#include "paddle/infrt/dialect/phi/ir/infrt_phi_baseDialect.h.inc"
#define GET_TYPEDEF_CLASSES
#include "paddle/infrt/dialect/phi/infrt_phi_baseTypes.h.inc"
#include "paddle/infrt/dialect/phi/ir/infrt_phi_baseTypes.h.inc"
#define GET_OP_CLASSES
#include "paddle/infrt/dialect/phi/ir/infrt_phi_base.h.inc"
namespace mlir {
namespace OpTrait {
......
......@@ -73,7 +73,7 @@ using ValueVariantType =
std::vector<phi::DenseTensor>,
paddle::experimental::ScalarBase<phi::DenseTensor>,
paddle::experimental::ScalarArrayBase<phi::DenseTensor>,
std::vector<phi::MetaTensor>,
std::vector<phi::MetaTensor*>,
phi::MetaConfig,
paddle::experimental::Backend,
paddle::experimental::DataLayout,
......
......@@ -94,12 +94,16 @@ std::vector<Tensor> split_impl(const Tensor& x,
std::vector<Tensor> out;
auto dense_outs = SetKernelOutput(out_number, kernel_backend, &out);
std::vector<phi::MetaTensor> meta_outs;
meta_outs.reserve(out_number);
std::vector<phi::MetaTensor*> meta_out_ptrs;
meta_out_ptrs.reserve(out_number);
for (size_t i = 0; i < out_number; ++i) {
meta_outs.push_back(dense_outs[i]);
meta_out_ptrs.push_back(&meta_outs.back());
}
phi::SplitInferMeta(
MakeMetaTensor(*dense_x), num_or_sections, axis, &meta_outs);
MakeMetaTensor(*dense_x), num_or_sections, axis, meta_out_ptrs);
using kernel_signature = void (*)(const platform::DeviceContext&,
const phi::DenseTensor&,
......
......@@ -136,26 +136,5 @@ phi::ScalarArray MakePhiScalarArrayFromVarList(
return result;
}
void ResetTensorDtypeAndLayoutByArgDef(phi::TensorBase* dst,
const phi::TensorArgDef& arg_def) {
VLOG(5) << "ResetTensor by TensorArgDef.";
if (phi::DenseTensor::classof(dst)) {
auto* dense_t = static_cast<phi::DenseTensor*>(dst);
auto* meta = phi::DenseTensorUtils::GetMutableMeta(dense_t);
meta->dtype = arg_def.dtype;
meta->layout = arg_def.layout;
} else if (phi::SelectedRows::classof(dst)) {
auto* selected_rows = static_cast<phi::SelectedRows*>(dst);
auto* meta =
phi::DenseTensorUtils::GetMutableMeta(selected_rows->mutable_value());
meta->dtype = arg_def.dtype;
meta->layout = arg_def.layout;
} else {
PADDLE_THROW(phi::errors::Unimplemented(
"Unsupported tensor type is received when reseting tensor dtype and "
"layout by argument definition."));
}
}
} // namespace experimental
} // namespace paddle
......@@ -42,8 +42,5 @@ phi::ScalarArray MakePhiScalarArrayFromVar(const framework::Variable& variable);
phi::ScalarArray MakePhiScalarArrayFromVarList(
const std::vector<framework::Variable*>& variable_list);
void ResetTensorDtypeAndLayoutByArgDef(phi::TensorBase* dst,
const phi::TensorArgDef& arg_def);
} // namespace experimental
} // namespace paddle
......@@ -227,4 +227,12 @@ class GPUContext : public DeviceContext {
// must use different function name for cudnn kernel
using GPUDNNContext = GPUContext;
// KPS (Kernel PrimitiveS API) needs to exist as a kind of backend because we
// want a single KPS-based kernel implementation to run on both GPU and XPU,
// so KPSContext is needed when registering KPS kernels.
// Note: XPU and GPU cannot be compiled at the same time!
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
using KPSContext = GPUContext;
#endif
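// Illustrative sketch (assumption, not part of this diff): a kernel template
// declared once against KPSContext. On this CUDA/HIP build it is instantiated
// with KPSContext == GPUContext, while an XPU-KP build would instantiate the
// identical template with KPSContext == XPUContext. The function itself is
// hypothetical.
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <typename T>
void KpsFillSketch(const KPSContext& dev_ctx, T value, T* out, int64_t numel);
#endif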
} // namespace phi
......@@ -66,4 +66,12 @@ class XPUContext : public DeviceContext {
std::unique_ptr<Impl> impl_;
};
// KPS (Kernel PrimitiveS API) needs to exist as a kind of backend because we
// want a single KPS-based kernel implementation to run on both GPU and XPU,
// so KPSContext is needed when registering KPS kernels.
// Note: XPU and GPU cannot be compiled at the same time!
#if PADDLE_WITH_XPU_KP
using KPSContext = XPUContext;
#endif
} // namespace phi
......@@ -52,6 +52,9 @@ enum class Backend : uint8_t {
MKLDNN,
GPUDNN, // cuDNN and hipDNN
// paddle kernel primitives backend
KPS,
// end of backend types
NUM_BACKENDS,
......@@ -115,6 +118,9 @@ inline std::ostream& operator<<(std::ostream& os, Backend backend) {
case Backend::GPUDNN:
os << "GPUDNN";
break;
case Backend::KPS:
os << "KPS";
break;
default: {
size_t device_type_id_ = static_cast<size_t>(backend) -
static_cast<size_t>(Backend::NUM_BACKENDS);
......@@ -147,6 +153,8 @@ inline Backend StringToBackend(const char* backend_cstr) {
return Backend::MKLDNN;
} else if (s == std::string("GPUDNN")) {
return Backend::GPUDNN;
} else if (s == std::string("KPS")) {
return Backend::KPS;
} else {
return static_cast<Backend>(static_cast<size_t>(Backend::NUM_BACKENDS) +
phi::GetOrRegisterGlobalDeviceTypeId(s));
......
......@@ -22,8 +22,8 @@ cc_library(sparse_csr_tensor SRCS sparse_csr_tensor.cc DEPS dense_tensor tensor_
cc_library(meta_tensor SRCS meta_tensor.cc DEPS tensor_base tensor_meta dense_tensor)
cc_library(infermeta_utils SRCS infermeta_utils.cc DEPS meta_tensor)
cc_library(phi_device_context SRCS device_context.cc DEPS dense_tensor selected_rows)
cc_library(selected_rows SRCS selected_rows_impl.cc DEPS dense_tensor phi_enforce ddim memcpy)
cc_library(phi_device_context SRCS device_context.cc DEPS dense_tensor selected_rows)
cc_library(phi_custom_kernel SRCS custom_kernel.cc DEPS kernel_factory convert_utils)
......
......@@ -66,6 +66,14 @@ phi::Place TransToPhiPlace(const Backend& backend, bool set_device_id) {
case phi::Backend::XPU:
return phi::XPUPlace(
set_device_id ? phi::backends::xpu::GetXPUCurrentDeviceId() : 0);
#endif
case phi::Backend::KPS:
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
return phi::GPUPlace(
set_device_id ? phi::backends::gpu::GetCurrentDeviceId() : 0);
#elif defined(PADDLE_WITH_XPU_KP)
return phi::XPUPlace(
set_device_id ? phi::backends::xpu::GetXPUCurrentDeviceId() : 0);
#endif
default: {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
......
......@@ -20,16 +20,16 @@ void RegisterCustomKernels(const CustomKernelMap& custom_kernel_map) {
auto& kernel_info_map = custom_kernel_map.GetMap();
VLOG(3) << "Size of custom_kernel_map: " << kernel_info_map.size();
auto& kernels = KernelFactory::Instance().kernels();
for (auto& pair : kernel_info_map) {
PADDLE_ENFORCE_EQ(
KernelFactory::Instance().HasCompatiblePhiKernel(pair.first),
true,
PADDLE_ENFORCE_NE(
kernels.find(pair.first),
kernels.end(),
phi::errors::InvalidArgument(
"The kernel %s is not ready for custom kernel registering.",
pair.first));
for (auto& info_pair : pair.second) {
auto& kernels = KernelFactory::Instance().kernels();
PADDLE_ENFORCE_EQ(
kernels[pair.first].find(info_pair.first),
kernels[pair.first].end(),
......
......@@ -73,11 +73,6 @@ void DenseTensor::set_layout(const paddle::framework::DataLayout layout) {
// Note: When you reset holder, you need to ensure the offset is correct
void DenseTensor::ResetHolder(const std::shared_ptr<phi::Allocation>& holder) {
if (holder_) {
// TODO(zyfncg): The change of static_cast<> in check will recover back
// when SetAllocationForOutputTenosr is deleted.
// Now the numel() may return -1, and will cast to a very large number when
// compare with a data with unsigned long type, this will make checking
// failed, so it's a temporary solution to deal with this problem.
PADDLE_ENFORCE_LE(
numel() * static_cast<int64_t>(SizeOf(dtype())) +
static_cast<int64_t>(meta_.offset),
......
......@@ -75,13 +75,13 @@ paddle::optional<const phi::MetaTensor&> InferMetaContext::OptionalInputAt(
: paddle::optional<const phi::MetaTensor&>{paddle::none};
}
std::vector<MetaTensor> InferMetaContext::InputsBetween(size_t start,
std::vector<MetaTensor*> InferMetaContext::InputsBetween(size_t start,
size_t end) const {
std::vector<MetaTensor> result;
std::vector<MetaTensor*> result;
result.reserve(end - start);
for (size_t i = start; i < end; ++i) {
result.emplace_back(*inputs_.at(i));
result.push_back(inputs_.at(i).get());
}
return result;
......@@ -91,12 +91,12 @@ MetaTensor* InferMetaContext::MutableOutputAt(size_t idx) {
return outputs_.at(idx).get();
}
std::vector<MetaTensor> InferMetaContext::MutableOutputBetween(size_t start,
std::vector<MetaTensor*> InferMetaContext::MutableOutputBetween(size_t start,
size_t end) {
std::vector<MetaTensor> result;
std::vector<MetaTensor*> result;
result.reserve(end - start);
for (size_t i = start; i < end; ++i) {
result.emplace_back(*outputs_.at(i));
result.emplace_back(outputs_.at(i).get());
}
return result;
}
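// Illustrative note (assumption, not part of this diff): with the pointer
// form, an InferMeta function can mutate each real output MetaTensor in
// place, and an absent output can be represented as a nullptr entry, e.g.
//   std::vector<MetaTensor*> outs = ctx->MutableOutputBetween(first, last);
//   for (auto* out : outs) {
//     if (out) out->set_dtype(x.dtype());
//   }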
......
......@@ -50,13 +50,13 @@ class InferMetaContext {
const std::pair<int, int>& OutputRangeAt(size_t idx) const;
const MetaConfig& GetMetaConfig() const;
const MetaTensor& InputAt(size_t idx) const;
const MetaTensor& InputAt(size_t idx) const;
paddle::optional<const phi::MetaTensor&> OptionalInputAt(size_t idx) const;
std::vector<MetaTensor*> InputsBetween(size_t start, size_t end) const;
std::vector<MetaTensor> InputsBetween(size_t start, size_t end) const;
MetaTensor* MutableOutputAt(size_t idx);
std::vector<MetaTensor> MutableOutputBetween(size_t start, size_t end);
std::vector<MetaTensor*> MutableOutputBetween(size_t start, size_t end);
template <typename AttrType>
AttrType AttrAt(size_t idx) {
......@@ -157,7 +157,7 @@ struct InferMetaFnImpl<Return (*)(Args...), infer_meta_fn> {
};
template <typename... Tail>
struct InferMetaFnCallHelper<const std::vector<MetaTensor>&, Tail...> {
struct InferMetaFnCallHelper<const std::vector<MetaTensor*>&, Tail...> {
template <int in_idx, int attr_idx, int out_idx, typename... PreviousArgs>
static void Call(InferMetaContext* ctx, PreviousArgs&... pargs) {
static_assert(attr_idx == 0,
......@@ -165,7 +165,7 @@ struct InferMetaFnImpl<Return (*)(Args...), infer_meta_fn> {
static_assert(out_idx == 0,
"InferMeta's Input should appear before Outputs.");
const std::pair<int, int> range = ctx->InputRangeAt(in_idx);
std::vector<MetaTensor> arg =
std::vector<MetaTensor*> arg =
ctx->InputsBetween(range.first, range.second);
InferMetaFnCallHelper<
Tail...>::template Call<in_idx + 1, attr_idx, out_idx>(ctx,
......@@ -210,13 +210,12 @@ struct InferMetaFnImpl<Return (*)(Args...), infer_meta_fn> {
};
template <typename... Tail>
struct InferMetaFnCallHelper<std::vector<MetaTensor>*, Tail...> {
struct InferMetaFnCallHelper<std::vector<MetaTensor*>, Tail...> {
template <int in_idx, int attr_idx, int out_idx, typename... PreviousArgs>
static void Call(InferMetaContext* ctx, PreviousArgs&... pargs) {
const std::pair<int, int> range = ctx->OutputRangeAt(out_idx);
std::vector<MetaTensor> tmp =
std::vector<MetaTensor*> arg =
ctx->MutableOutputBetween(range.first, range.second);
std::vector<MetaTensor>* arg = &tmp;
InferMetaFnCallHelper<
Tail...>::template Call<in_idx, attr_idx, out_idx + 1>(ctx,
pargs...,
......
......@@ -87,13 +87,11 @@ struct KernelArgsParseFunctor<Return_ (*)(Args_...)> {
default_tensor_layout,
default_key.dtype(),
arg_type);
#ifndef PADDLE_WITH_CUSTOM_KERNEL
} else if (arg_type == std::type_index(typeid(const SelectedRows&))) {
args_def->AppendInput(default_key.backend(),
default_tensor_layout,
default_key.dtype(),
arg_type);
#endif
} else if (arg_type == std::type_index(typeid(DenseTensor*))) {
args_def->AppendOutput(default_key.backend(),
default_tensor_layout,
......@@ -105,13 +103,11 @@ struct KernelArgsParseFunctor<Return_ (*)(Args_...)> {
default_tensor_layout,
default_key.dtype(),
arg_type);
#ifndef PADDLE_WITH_CUSTOM_KERNEL
} else if (arg_type == std::type_index(typeid(SelectedRows*))) {
args_def->AppendOutput(default_key.backend(),
default_tensor_layout,
default_key.dtype(),
arg_type);
#endif
} else {
// Attribute deal with
// TODO(chenweihang): now here allow any types of attribute, maybe
......
......@@ -23,9 +23,7 @@
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/kernel_context.h"
#ifndef PADDLE_WITH_CUSTOM_KERNEL
#include "paddle/phi/core/selected_rows.h"
#endif
#include "paddle/phi/core/sparse_coo_tensor.h"
#include "paddle/phi/core/sparse_csr_tensor.h"
#include "paddle/phi/core/type_defs.h"
......@@ -223,9 +221,7 @@ struct KernelImpl<Return (*)(DevCtx, Args...), kernel_fn> {
PT_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(DenseTensor);
PT_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(SelectedRows);
PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(DenseTensor);
#ifndef PADDLE_WITH_CUSTOM_KERNEL
PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(SelectedRows);
#endif
PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(SparseCooTensor);
PT_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(SparseCooTensor);
......@@ -260,9 +256,7 @@ struct KernelImpl<Return (*)(DevCtx, Args...), kernel_fn> {
PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(DenseTensor);
PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(DenseTensor);
#ifndef PADDLE_WITH_CUSTOM_KERNEL
PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(SelectedRows);
#endif
PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(SparseCooTensor);
PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(SparseCooTensor);
......
......@@ -23,13 +23,6 @@ limitations under the License. */
#include "paddle/utils/any.h"
#include "paddle/utils/optional.h"
// Note: mixed_vector includes many headers now. Will LoD be
// used on CUDA devices? Can we use small_vector here?
// @zhanlve: Rollback to original LoD for now
#ifndef PADDLE_WITH_CUSTOM_KERNEL
#include "paddle/fluid/framework/mixed_vector.h"
#endif
namespace phi {
using DDim = phi::DDim;
......
......@@ -225,6 +225,41 @@ void HuberLossInferMeta(const MetaTensor& input,
out->share_lod(input);
}
void IndexSampleInferMeta(const MetaTensor& x,
const MetaTensor& y,
MetaTensor* out,
MetaConfig config) {
auto input_dims = x.dims();
PADDLE_ENFORCE_EQ(input_dims.size(),
2,
errors::InvalidArgument(
"Inputs(X) shape of IndexSample op should be 2-D, but "
"got X's shape = [%s], please check X shape.",
input_dims));
auto index_dims = y.dims();
PADDLE_ENFORCE_EQ(
index_dims.size(),
2,
errors::InvalidArgument(
"Inputs(Index) shape of IndexSample op should be 2-D, but "
"got Index's shape [%s] , please check index shape.",
input_dims));
if (config.is_runtime) {
PADDLE_ENFORCE_EQ(input_dims[0],
index_dims[0],
errors::InvalidArgument(
"Inputs(X)'s value of dimension 0 must same with "
"Inputs(Index)'s value of dimension 0, but "
"got %d of Inputs(X), and got %d of Inputs(Index), "
"please check Inputs shape.",
input_dims[0],
index_dims[0]));
}
out->set_dtype(x.dtype());
out->set_dims(index_dims);
out->share_lod(y);
}
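As a worked example of the shape rule implemented by IndexSampleInferMeta above: X is [B, V], Index is [B, K], the batch dimensions must match at runtime, and the output takes the Index shape. A minimal self-contained sketch, with std::array standing in for phi::DDim:
#include <array>
#include <cassert>
#include <cstdint>
// Shape rule: X is [B, V], Index is [B, K], output is [B, K].
std::array<int64_t, 2> IndexSampleOutShape(std::array<int64_t, 2> x_dims,
                                           std::array<int64_t, 2> index_dims) {
  assert(x_dims[0] == index_dims[0]);  // batch dims must match at runtime
  return index_dims;
}
int main() {
  auto out = IndexSampleOutShape({4, 16}, {4, 3});
  return (out[0] == 4 && out[1] == 3) ? 0 : 1;
}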
void CrossInferMeta(const MetaTensor& x,
const MetaTensor& y,
int axis,
......@@ -271,8 +306,7 @@ void CrossInferMeta(const MetaTensor& x,
}
void Atan2InferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) {
auto in_dims = x.dims();
out->set_dims(in_dims);
out->share_meta(x);
}
void BCELossInferMeta(const MetaTensor& input,
......
......@@ -53,6 +53,11 @@ void HuberLossInferMeta(const MetaTensor& input_meta,
MetaTensor* residual,
MetaConfig config = MetaConfig());
void IndexSampleInferMeta(const MetaTensor& x,
const MetaTensor& y,
MetaTensor* out,
MetaConfig config = MetaConfig());
void CrossInferMeta(const MetaTensor& x,
const MetaTensor& y,
int axis,
......
......@@ -84,7 +84,7 @@ void BilinearTensorProductInferMeta(const MetaTensor& x,
out->set_dtype(x.dtype());
}
void ConcatInferMeta(const std::vector<MetaTensor>& x,
void ConcatInferMeta(const std::vector<MetaTensor*>& x,
const Scalar& axis_scalar,
MetaTensor* out,
MetaConfig config) {
......@@ -93,10 +93,19 @@ void ConcatInferMeta(const std::vector<MetaTensor>& x,
phi::errors::InvalidArgument(
"The size of input meta vector should be greater"
"than 0."));
if (axis_scalar.FromTensor()) {
auto out_dims =
phi::make_ddim(std::vector<int>(x.at(0)->dims().size(), -1));
out->set_dims(out_dims);
out->set_dtype(x.at(0)->dtype());
out->set_layout(x.at(0)->layout());
out->share_lod(*x.at(0));
return;
}
int axis = axis_scalar.to<int>();
// 1. calculate axis
int rank = x.at(0).dims().size();
int rank = x.at(0)->dims().size();
PADDLE_ENFORCE_EQ(
axis >= -rank && axis < rank,
true,
......@@ -111,15 +120,42 @@ void ConcatInferMeta(const std::vector<MetaTensor>& x,
// 2. calculate out dims
std::vector<phi::DDim> x_dims;
for (auto& x_t : x) {
x_dims.push_back(x_t.dims());
x_dims.reserve(x.size());
for (const auto* x_t : x) {
x_dims.emplace_back(x_t->dims());
}
phi::DDim out_dim =
phi::funcs::ComputeAndCheckShape(config.is_runtime, x_dims, axis);
out->set_dims(out_dim);
out->set_dtype(x.at(0).dtype());
out->set_layout(x.at(0).layout());
out->set_dtype(x.at(0)->dtype());
out->set_layout(x.at(0)->layout());
out->share_lod(*x.at(0));
}
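To make the new FromTensor branch above concrete: when the concat axis only becomes known at runtime, the static output shape cannot be computed, so the rank is kept and every dimension is marked unknown (-1). A minimal sketch, with a plain vector standing in for phi::DDim:
#include <cstdint>
#include <vector>
// Compile-time placeholder shape when the concat axis is carried in a tensor:
// same rank as the first input, every dimension unknown (-1).
std::vector<int64_t> UnknownShapeLike(const std::vector<int64_t>& x0_dims) {
  return std::vector<int64_t>(x0_dims.size(), -1);
}
int main() {
  auto out = UnknownShapeLike({8, 3, 32});
  return (out.size() == 3 && out[0] == -1) ? 0 : 1;
}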
void WhereInferMeta(const MetaTensor& condition,
const MetaTensor& x,
const MetaTensor& y,
MetaTensor* out) {
auto cond_dims = condition.dims();
auto x_dims = x.dims();
auto y_dims = y.dims();
PADDLE_ENFORCE_EQ(
cond_dims,
x_dims,
phi::errors::InvalidArgument(
"The dims of Inputs(Condition) and Inputs(X) should be same. "
"But received Condition's shape is [%s], X's shape is [%s]",
cond_dims,
x_dims));
PADDLE_ENFORCE_EQ(x_dims,
y_dims,
phi::errors::InvalidArgument(
"The dims of Inputs(X) and Inputs(Y) should be same. "
"But received X's shape is [%s], Y's shape is [%s]",
x_dims,
y_dims));
out->share_meta(x);
}
} // namespace phi
......@@ -25,9 +25,13 @@ void BilinearTensorProductInferMeta(const MetaTensor& x,
MetaTensor* out,
MetaConfig config = MetaConfig());
void ConcatInferMeta(const std::vector<MetaTensor>& x,
void ConcatInferMeta(const std::vector<MetaTensor*>& x,
const Scalar& axis_scalar,
MetaTensor* out,
MetaConfig config = MetaConfig());
void WhereInferMeta(const MetaTensor& condition,
const MetaTensor& x,
const MetaTensor& y,
MetaTensor* out);
} // namespace phi
......@@ -459,8 +459,19 @@ void TransferLayoutInferMeta(const MetaTensor& x,
void SplitInferMeta(const MetaTensor& x,
const ScalarArray& num_or_sections,
const Scalar& axis,
std::vector<MetaTensor>* out,
std::vector<MetaTensor*> out,
MetaConfig config) {
if (!config.is_runtime) {
if (axis.FromTensor() || num_or_sections.FromTensor()) {
auto out_dims = phi::make_ddim(std::vector<int>(x.dims().size(), -1));
for (auto* item : out) {
item->set_dims(out_dims);
item->share_lod(x);
}
return;
}
}
int axis_value = axis.to<int>();
int rank = x.dims().size();
PADDLE_ENFORCE_EQ(
......@@ -475,15 +486,16 @@ void SplitInferMeta(const MetaTensor& x,
axis_value = axis_value + rank;
}
std::vector<phi::DDim> out_dims(out.size(), x.dims());
auto input_axis_dim = x.dims().at(axis_value);
auto num_or_sections_data = num_or_sections.GetData();
// step1: get formatted sections
std::vector<int64_t> sections;
// num_or_sections is a number
if (num_or_sections_data.size() == 1) {
if (config.is_runtime || input_axis_dim > 0) {
int num = num_or_sections_data.at(0);
PADDLE_ENFORCE_EQ(input_axis_dim % num,
PADDLE_ENFORCE_EQ(
input_axis_dim % num,
0,
phi::errors::InvalidArgument(
"The input's size along the split dimension "
......@@ -494,8 +506,14 @@ void SplitInferMeta(const MetaTensor& x,
x.dims(),
axis_value));
for (int i = 0; i < num; ++i) {
sections.push_back(input_axis_dim / num);
size_t out_axis_dim = input_axis_dim / num;
for (auto& out_dim : out_dims) {
out_dim[axis_value] = out_axis_dim;
}
} else {
for (auto& out_dim : out_dims) {
out_dim[axis_value] = -1;
}
}
} else {
// num_or_sections is a sections
......@@ -503,10 +521,9 @@ void SplitInferMeta(const MetaTensor& x,
int unknow_dim_idx = -1;
int num_of_unknow = 0;
int sum_of_section = 0;
std::vector<int64_t> sections = num_or_sections_data;
for (size_t i = 0; i < num_or_sections_data.size(); ++i) {
sections.push_back(num_or_sections_data[i]);
if (num_or_sections_data[i] == unknow_dim_val) {
num_of_unknow++;
unknow_dim_idx = i;
......@@ -558,31 +575,22 @@ void SplitInferMeta(const MetaTensor& x,
x.dims(),
axis_value));
}
}
// step2: fill out dims
std::vector<phi::DDim> out_dims(sections.size(), x.dims());
if (config.is_runtime || input_axis_dim > 0) {
for (size_t i = 0; i < sections.size(); ++i) {
for (size_t i = 0; i < out_dims.size(); ++i) {
out_dims[i][axis_value] = sections[i];
}
} else {
for (size_t i = 0; i < sections.size(); ++i) {
out_dims[i][axis_value] = -1;
}
}
for (size_t i = 0; i < sections.size(); ++i) {
for (size_t i = 0; i < out.size(); ++i) {
if (axis_value != 0) {
// Only pass LoD when not splitting along the first dim.
(*out)[i].set_dtype(x.dtype());
(*out)[i].set_dims(out_dims[i]);
(*out)[i].set_layout(x.layout());
out.at(i)->set_dtype(x.dtype());
out.at(i)->set_dims(out_dims[i]);
out.at(i)->set_layout(x.layout());
} else {
(*out)[i].set_dtype(x.dtype());
(*out)[i].set_dims(out_dims[i]);
(*out)[i].set_layout(x.layout());
(*out)[i].share_lod(x);
out.at(i)->set_dtype(x.dtype());
out.at(i)->set_dims(out_dims[i]);
out.at(i)->set_layout(x.layout());
out.at(i)->share_lod(x);
}
}
}
......
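For the even-split branch above (num_or_sections given as a single number), each output keeps the input shape except along the split axis, which becomes input_axis_dim / num when the extent is known and -1 otherwise. A self-contained sketch of that rule, assuming divisibility has already been enforced as in the PADDLE_ENFORCE above:
#include <cstdint>
#include <vector>
// Per-output extent along the split axis; -1 marks a dimension that is still
// unknown at compile time.
std::vector<int64_t> EvenSections(int64_t input_axis_dim, int64_t num,
                                  bool is_runtime) {
  if (is_runtime || input_axis_dim > 0) {
    return std::vector<int64_t>(num, input_axis_dim / num);
  }
  return std::vector<int64_t>(num, -1);
}
int main() {
  auto s = EvenSections(/*input_axis_dim=*/9, /*num=*/3, /*is_runtime=*/true);
  return (s.size() == 3 && s[0] == 3) ? 0 : 1;
}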
......@@ -107,7 +107,7 @@ void TransferLayoutInferMeta(const MetaTensor& x,
void SplitInferMeta(const MetaTensor& x_meta,
const ScalarArray& num_or_sections,
const Scalar& axis,
std::vector<MetaTensor>* out,
std::vector<MetaTensor*> out,
MetaConfig config = MetaConfig());
void UnbindInferMeta(const MetaTensor& x,
......
......@@ -31,13 +31,16 @@ DenseTensor Concat(const Context& dev_ctx,
const std::vector<DenseTensor>& x,
const Scalar& axis) {
std::vector<MetaTensor> meta_x;
meta_x.reserve(x.size());
std::vector<MetaTensor*> meta_x_ptr;
for (const auto& t : x) {
meta_x.emplace_back(t);
meta_x_ptr.push_back(&meta_x.back());
}
auto dense_out = phi::Empty<T, Context>(dev_ctx);
MetaTensor meta_out(&dense_out);
ConcatInferMeta(meta_x, axis.to<int>(), &meta_out, /*is_runtime=*/true);
ConcatInferMeta(meta_x_ptr, axis.to<int>(), &meta_out, /*is_runtime=*/true);
ConcatKernel<T, Context>(dev_ctx, x, axis, &dense_out);
return dense_out;
}
......
......@@ -12,11 +12,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/atan2_grad_kernel.h"
#include "paddle/phi/kernels/impl/atan2_grad_kernel_impl.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/device_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/atan2_grad_kernel_impl.h"
PD_REGISTER_KERNEL(atan2_grad,
CPU,
......
......@@ -12,11 +12,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/atan2_kernel.h"
#include "paddle/phi/kernels/impl/atan2_kernel_impl.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/device_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/atan2_kernel_impl.h"
PD_REGISTER_KERNEL(atan2,
CPU,
......
......@@ -37,6 +37,7 @@ void ConcatKernel(const Context& dev_ctx,
axis = phi::funcs::ComputeAxis(axis, x[0].dims().size());
std::vector<phi::DDim> x_dims;
x_dims.reserve(x.size());
for (size_t i = 0; i < x.size(); ++i) {
x_dims.push_back(x[i].dims());
}
......@@ -97,9 +98,10 @@ void ConcatKernel(const Context& dev_ctx,
}
} else {
std::vector<phi::DenseTensor> inputs;
inputs.reserve(x.size());
for (size_t j = 0; j < x.size(); ++j) {
if (x[j].numel() > 0) {
inputs.push_back(x[j]);
inputs.emplace_back(x[j]);
} else {
continue;
}
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/index_sample_grad_kernel.h"
#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
template <typename T, typename Context, typename IndexT = int>
void IndexSampleGradInner(const Context& context,
const DenseTensor& out_grad,
const DenseTensor& index,
DenseTensor* x_grad) {
std::vector<T> out_grad_vec;
std::vector<IndexT> index_vec;
paddle::framework::TensorToVector(out_grad, context, &out_grad_vec);
paddle::framework::TensorToVector(index, context, &index_vec);
auto index_dims = index.dims();
auto x_grad_dims = x_grad->dims();
auto value_length = x_grad_dims[1];
auto index_length = index_dims[1];
int index_ids_num = index.numel();
std::vector<T> x_grad_vec(x_grad->numel(), 0);
for (int i = 0; i < index_ids_num; i++) {
int b = floor(i / index_length);
PADDLE_ENFORCE_GE(
index_vec[i],
0,
errors::InvalidArgument(
"Variable value (index) of OP(index_sample_grad) "
"expected >= 0 and < %ld, but got %ld. Please check input "
"value.",
value_length,
index_vec[i]));
PADDLE_ENFORCE_LT(
index_vec[i],
value_length,
errors::InvalidArgument(
"Variable value (index) of OP(index_sample_grad) "
"expected >= 0 and < %ld, but got %ld. Please check input "
"value.",
value_length,
index_vec[i]));
int v_i = b * value_length + static_cast<int>(index_vec[i]);
x_grad_vec[v_i] += out_grad_vec[i];
}
context.template Alloc<T>(x_grad);
paddle::framework::TensorFromVector(x_grad_vec, context, x_grad);
x_grad->Resize(x_grad_dims);
}
template <typename T, typename Context>
void IndexSampleGradKernel(const Context& ctx,
const DenseTensor& out_grad,
const DenseTensor& x,
const DenseTensor& index,
DenseTensor* x_grad) {
auto index_type = index.dtype();
bool index_type_match =
index_type == DataType::INT32 || index_type == DataType::INT64;
PADDLE_ENFORCE_EQ(
index_type_match,
true,
errors::InvalidArgument(
"Input(Index) holds the wrong type, it holds %s, but "
"desires to be %s or %s",
paddle::framework::DataTypeToString(
paddle::framework::TransToProtoVarType(index_type)),
paddle::framework::DataTypeToString(
paddle::framework::TransToProtoVarType(DataType::INT32)),
paddle::framework::DataTypeToString(
paddle::framework::TransToProtoVarType((DataType::INT64)))));
if (index_type == DataType::INT32) {
IndexSampleGradInner<T, Context, int>(ctx, out_grad, index, x_grad);
} else if (index_type == DataType::INT64) {
IndexSampleGradInner<T, Context, int64_t>(ctx, out_grad, index, x_grad);
}
}
} // namespace phi
PD_REGISTER_KERNEL(index_sample_grad,
CPU,
ALL_LAYOUT,
phi::IndexSampleGradKernel,
float,
double,
int,
int64_t) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/index_sample_kernel.h"
#include <cmath>
#include <fstream>
#include <set>
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
template <typename T, typename Context, typename IndexT = int>
void IndexSampleInner(const Context &context,
const DenseTensor &input,
const DenseTensor &index,
DenseTensor *output) {
auto input_dims = input.dims();
auto index_dims = index.dims();
int batch_size = input_dims[0];
auto value_length = input_dims[1];
auto index_length = index_dims[1];
int index_ids_num = index.numel();
std::vector<T> input_vec;
std::vector<IndexT> index_vec;
paddle::framework::TensorToVector(input, context, &input_vec);
paddle::framework::TensorToVector(index, context, &index_vec);
std::vector<T> res(index_ids_num);
for (int i = 0; i < index_ids_num; i++) {
int b = floor(i / index_length);
PADDLE_ENFORCE_GE(
index_vec[i],
0,
errors::InvalidArgument(
"Variable value (index) of OP(index_sample) "
"expected >= 0 and < %ld, but got %ld. Please check input "
"value.",
value_length,
index_vec[i]));
PADDLE_ENFORCE_LT(
index_vec[i],
value_length,
errors::InvalidArgument(
"Variable value (index) of OP(index_sample) "
"expected >= 0 and < %ld, but got %ld. Please check input "
"value.",
value_length,
index_vec[i]));
int v_i = b * value_length + static_cast<int>(index_vec[i]);
T v = input_vec[v_i];
VLOG(4) << "Index Sample: batch = " << b << " index = " << v_i
<< " value = " << v;
res[i] = v;
}
auto ddim = phi::make_ddim({batch_size, index_length});
context.template Alloc<T>(output);
paddle::framework::TensorFromVector(res, context, output);
output->Resize(ddim);
}
template <typename T, typename Context>
void IndexSampleKernel(const Context &ctx,
const DenseTensor &x,
const DenseTensor &index,
DenseTensor *out) {
ctx.template Alloc<T>(out);
auto index_type = index.dtype();
bool index_type_match =
index_type == DataType::INT32 || index_type == DataType::INT64;
PADDLE_ENFORCE_EQ(
index_type_match,
true,
errors::InvalidArgument(
"Input(Index) holds the wrong type, it holds %s, but "
"desires to be %s or %s",
paddle::framework::DataTypeToString(
paddle::framework::TransToProtoVarType(index_type)),
paddle::framework::DataTypeToString(
paddle::framework::TransToProtoVarType(DataType::INT32)),
paddle::framework::DataTypeToString(
paddle::framework::TransToProtoVarType((DataType::INT64)))));
if (index_type == DataType::INT32) {
IndexSampleInner<T, Context, int>(ctx, x, index, out);
} else if (index_type == DataType::INT64) {
IndexSampleInner<T, Context, int64_t>(ctx, x, index, out);
}
}
} // namespace phi
PD_REGISTER_KERNEL(index_sample,
CPU,
ALL_LAYOUT,
phi::IndexSampleKernel,
float,
double,
int,
int64_t) {}
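For readers following the flattened-index arithmetic in IndexSampleInner above, here is a host-side reference of the same gather: out[b][k] = x[b][index[b][k]], with the flat position computed as b * value_length + idx. This is an illustration, not part of the patch:
#include <cstddef>
#include <cstdint>
#include <vector>
std::vector<float> IndexSampleRef(const std::vector<float>& x,
                                  const std::vector<int64_t>& index,
                                  int64_t value_length, int64_t index_length) {
  std::vector<float> out(index.size());
  for (std::size_t i = 0; i < index.size(); ++i) {
    int64_t b = static_cast<int64_t>(i) / index_length;  // batch row
    out[i] = x[b * value_length + index[i]];             // gather within row b
  }
  return out;
}
int main() {
  // x is 2x3 {{1,2,3},{4,5,6}}, index is 2x2 {{2,0},{1,1}} -> out {{3,1},{5,5}}
  std::vector<float> x = {1, 2, 3, 4, 5, 6};
  std::vector<int64_t> index = {2, 0, 1, 1};
  auto out = IndexSampleRef(x, index, /*value_length=*/3, /*index_length=*/2);
  return (out[0] == 3 && out[1] == 1 && out[2] == 5 && out[3] == 5) ? 0 : 1;
}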
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/logical_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/cpu/elementwise.h"
#include "paddle/phi/kernels/funcs/logical_functor.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/platform/transform.h"
namespace phi {
#define DEFINE_LOGICAL_BINARY_KERNEL(type) \
template <typename T, typename Context> \
void Logical##type##Kernel(const Context& dev_ctx, \
const DenseTensor& x, \
const DenseTensor& y, \
DenseTensor* out) { \
funcs::Logical##type##Functor<T> binary_func; \
ElementwiseCompute<funcs::Logical##type##Functor<T>, T, bool>( \
dev_ctx, x, y, -1, binary_func, out); \
}
DEFINE_LOGICAL_BINARY_KERNEL(And)
DEFINE_LOGICAL_BINARY_KERNEL(Or)
DEFINE_LOGICAL_BINARY_KERNEL(Xor)
#undef DEFINE_LOGICAL_BINARY_KERNEL
template <typename T, typename Context>
void LogicalNotKernel(const Context& dev_ctx,
const DenseTensor& x,
DenseTensor* out) {
auto* out_ptr = dev_ctx.template Alloc<bool>(out);
funcs::LogicalNotFunctor<T> unary_func;
paddle::platform::Transform<Context> trans;
trans(dev_ctx, x.data<T>(), x.data<T>() + x.numel(), out_ptr, unary_func);
}
} // namespace phi
#define REGISTER_LOGICAL_CPU_KERNEL(logical_and, func_type) \
PD_REGISTER_KERNEL(logical_and, \
CPU, \
ALL_LAYOUT, \
phi::Logical##func_type##Kernel, \
float, \
double, \
bool, \
int64_t, \
int, \
int8_t, \
int16_t) {}
REGISTER_LOGICAL_CPU_KERNEL(logical_and, And)
REGISTER_LOGICAL_CPU_KERNEL(logical_or, Or)
REGISTER_LOGICAL_CPU_KERNEL(logical_not, Not)
REGISTER_LOGICAL_CPU_KERNEL(logical_xor, Xor)
......@@ -28,20 +28,6 @@ void SplitKernel(const Context& dev_ctx,
const ScalarArray& num_or_sections,
const Scalar& axis_scalar,
std::vector<DenseTensor*> outs) {
// need to infershape output
if (num_or_sections.FromTensor() || axis_scalar.FromTensor()) {
std::vector<MetaTensor> out_metas;
for (size_t i = 0; i < outs.size(); ++i) {
out_metas.push_back(outs[i]);
}
phi::SplitInferMeta(x, num_or_sections, axis_scalar, &out_metas, true);
for (size_t i = 0; i < out_metas.size(); ++i) {
outs[i]->Resize(out_metas[i].dims());
}
}
std::vector<const DenseTensor*> shape_refer;
for (size_t j = 0; j < outs.size(); ++j) {
dev_ctx.template Alloc<T>(outs[j]);
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/uniform_random_kernel.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
template <typename T>
inline void UniformRealDistribution(T *data,
const int64_t &size,
const float &min,
const float &max,
std::shared_ptr<std::mt19937_64> engine) {
std::uniform_real_distribution<T> dist(static_cast<T>(min),
static_cast<T>(max));
for (int64_t i = 0; i < size; ++i) {
data[i] = dist(*engine);
}
}
template <>
inline void UniformRealDistribution(phi::dtype::bfloat16 *data,
const int64_t &size,
const float &min,
const float &max,
std::shared_ptr<std::mt19937_64> engine) {
std::uniform_real_distribution<float> dist(min, max);
for (int64_t i = 0; i < size; ++i) {
data[i] = static_cast<phi::dtype::bfloat16>(dist(*engine));
}
}
template <typename T, typename Context>
void UniformRandomRawKernel(const Context &dev_ctx,
const ScalarArray &shape,
DataType dtype,
float min,
float max,
int seed,
int diag_num,
int diag_step,
float diag_val,
DenseTensor *out) {
out->Resize(phi::make_ddim(shape.GetData()));
VLOG(4) << out->dims();
T *data = dev_ctx.template Alloc<T>(out);
auto size = out->numel();
std::shared_ptr<std::mt19937_64> engine;
if (seed) {
engine = std::make_shared<std::mt19937_64>();
engine->seed(seed);
} else {
engine = dev_ctx.GetGenerator()->GetCPUEngine();
}
UniformRealDistribution<T>(data, size, min, max, engine);
if (diag_num > 0) {
PADDLE_ENFORCE_GT(
size,
(diag_num - 1) * (diag_step + 1),
phi::errors::InvalidArgument(
"ShapeInvalid: the diagonal's elements is equal (num-1) "
"* (step-1) with num %d, step %d,"
"It should be smaller than %d, but received %d",
diag_num,
diag_step,
(diag_num - 1) * (diag_step + 1),
size));
for (int64_t i = 0; i < diag_num; ++i) {
int64_t pos = i * diag_step + i;
data[pos] = diag_val;
}
}
}
template <typename T, typename Context>
void UniformRandomKernel(const Context &dev_ctx,
const ScalarArray &shape,
DataType dtype,
float min,
float max,
int seed,
DenseTensor *out) {
UniformRandomRawKernel<T>(
dev_ctx, shape, dtype, min, max, seed, 0, 0, 0.0f, out);
}
} // namespace phi
PD_REGISTER_KERNEL(uniform_random_raw,
CPU,
ALL_LAYOUT,
phi::UniformRandomRawKernel,
float,
double,
phi::dtype::bfloat16) {}
PD_REGISTER_KERNEL(uniform_random,
CPU,
ALL_LAYOUT,
phi::UniformRandomKernel,
float,
double,
phi::dtype::bfloat16) {}
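To illustrate the diagonal overwrite performed by UniformRandomRawKernel above: after sampling, element i of the diagonal lands at flat position i * diag_step + i, i.e. every (diag_step + 1)-th element starting at 0 is set to diag_val. A minimal host-side sketch:
#include <cstdint>
#include <vector>
void FillDiag(std::vector<float>* data, int64_t diag_num, int64_t diag_step,
              float diag_val) {
  for (int64_t i = 0; i < diag_num; ++i) {
    (*data)[i * diag_step + i] = diag_val;  // positions 0, step+1, 2*(step+1), ...
  }
}
int main() {
  std::vector<float> data(16, 0.5f);
  FillDiag(&data, /*diag_num=*/3, /*diag_step=*/4, /*diag_val=*/1.0f);
  return (data[0] == 1.0f && data[5] == 1.0f && data[10] == 1.0f) ? 0 : 1;
}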
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/where_grad_kernel.h"
namespace phi {
template <typename T, typename Context>
void WhereGradKernel(const Context& ctx,
const DenseTensor& condition,
const DenseTensor& x,
const DenseTensor& y,
const DenseTensor& out_grad,
DenseTensor* x_grad,
DenseTensor* y_grad) {
const auto* cond_data = condition.data<bool>();
auto numel = condition.numel();
auto* dout = out_grad.data<T>();
if (x_grad != nullptr) {
auto* dx = ctx.template Alloc<T>(x_grad);
for (int i = 0; i < numel; i++) {
dx[i] = dout[i] * (cond_data[i] ? 1. : 0.);
}
}
if (y_grad != nullptr) {
auto* dy = ctx.template Alloc<T>(y_grad);
for (int i = 0; i < numel; i++) {
dy[i] = dout[i] * (cond_data[i] ? 0. : 1.);
}
}
}
} // namespace phi
PD_REGISTER_KERNEL(where_grad,
CPU,
ALL_LAYOUT,
phi::WhereGradKernel,
float,
double,
int,
int64_t) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/where_kernel.h"
namespace phi {
template <typename T, typename Context>
void WhereKernel(const Context& ctx,
const DenseTensor& condition,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* out) {
const bool* cond_data = condition.data<bool>();
const T* x_data = x.data<T>();
const T* y_data = y.data<T>();
auto x_numel = x.numel();
T* out_data = ctx.template Alloc<T>(out);
for (int i = 0; i < x_numel; i++) {
out_data[i] = cond_data[i] ? x_data[i] : y_data[i];
}
}
} // namespace phi
PD_REGISTER_KERNEL(
where, CPU, ALL_LAYOUT, phi::WhereKernel, float, double, int, int64_t) {}
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/phi/core/hostdevice.h"
namespace phi {
// Aligned vector generates vectorized load/store on CUDA.
template <typename T, int Size>
struct alignas(sizeof(T) * Size) AlignedVector {
T val[Size];
HOSTDEVICE inline const T& operator[](int i) const { return val[i]; }
HOSTDEVICE inline T& operator[](int i) { return val[i]; }
};
template <typename T, int Size>
HOSTDEVICE inline void Load(const T* addr, AlignedVector<T, Size>* vec) {
const AlignedVector<T, Size>* addr_vec =
reinterpret_cast<const AlignedVector<T, Size>*>(addr);
*vec = *addr_vec;
}
template <typename T, int Size>
HOSTDEVICE inline void Store(const AlignedVector<T, Size>& vec, T* addr) {
AlignedVector<T, Size>* addr_vec =
reinterpret_cast<AlignedVector<T, Size>*>(addr);
*addr_vec = vec;
}
/*
 * A vectorized load is only possible when the input address is aligned to a
 * multiple of 1, 2, or 4 elements. Moreover, a single vectorized load moves at
 * most 128 bits. Hence, the valid vectorized load width must be determined
 * under both of these constraints.
 */
template <typename T>
int GetVectorizedSize(const T* pointer) {
constexpr int max_load_bits = 128;
int valid_vec_size = max_load_bits / CHAR_BIT / sizeof(T);
uint64_t address = reinterpret_cast<uint64_t>(pointer);
constexpr int vec8 = std::alignment_of<AlignedVector<T, 8>>::value; // NOLINT
constexpr int vec4 = std::alignment_of<AlignedVector<T, 4>>::value; // NOLINT
constexpr int vec2 = std::alignment_of<AlignedVector<T, 2>>::value; // NOLINT
if (address % vec8 == 0) {
/*
 * Currently we handle at most 4 elements per vectorized load/store. If
 * performance tests show that handling 8 elements at once is faster, the
 * return statement below can be changed to
 * " return std::min(8, valid_vec_size); ".
 */
return std::min(4, valid_vec_size);
} else if (address % vec4 == 0) {
return std::min(4, valid_vec_size);
} else if (address % vec2 == 0) {
return std::min(2, valid_vec_size);
} else {
return 1;
}
}
} // namespace phi
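A host-side illustration of the alignment probe implemented by GetVectorizedSize above: pick the widest element count whose byte alignment divides the pointer, capped at 128 bits per load. This is a simplified restatement for illustration (the phi version additionally caps the result at 4), not the function itself:
#include <cstdint>
#include <cstdio>
#include <vector>
template <typename T>
int VectorizedSizeOf(const T* p) {
  constexpr int kMaxBytes = 16;  // 128 bits per vectorized load
  const auto addr = reinterpret_cast<std::uintptr_t>(p);
  for (int elems = kMaxBytes / static_cast<int>(sizeof(T)); elems > 1;
       elems /= 2) {
    if (addr % (elems * sizeof(T)) == 0) return elems;
  }
  return 1;
}
int main() {
  std::vector<float> buf(32);
  std::printf("vec size at buf[0]: %d\n", VectorizedSizeOf(buf.data()));
  std::printf("vec size at buf[1]: %d\n", VectorizedSizeOf(buf.data() + 1));
  return 0;
}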
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef __NVCC__
#include <curand_kernel.h>
#endif
#ifdef __HIPCC__
#include <hiprand_kernel.h>
#endif
#include "paddle/phi/backends/gpu/gpu_info.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/device_context.h"
#include "paddle/phi/core/generator.h"
#include "paddle/phi/kernels/funcs/index_impl.cu.h"
#if defined(__NVCC__) || defined(__HIPCC__)
#include "paddle/phi/kernels/primitive/kernel_primitives.h"
#endif
#if !defined(_WIN32)
#define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
#else
// There is no equivalent intrinsic in MSVC.
#define UNLIKELY(condition) (condition)
#endif
namespace phi {
namespace distribution {
/********************* Transformation Function **********************/
template <typename T>
struct exponential_transform {
explicit exponential_transform(T lambda) : lambda_(lambda) {}
HOSTDEVICE inline T operator()(T val) const {
#if defined(__NVCC__) || defined(__HIPCC__)
if (std::is_same<T, double>::value) {
return static_cast<T>(-1.0) / lambda_ * log(val);
} else {
return static_cast<T>(-1.0) / lambda_ * __logf(val);
}
#else
return static_cast<T>(-1.0) / lambda_ * std::log(static_cast<T>(1.0) - val);
#endif
}
private:
T lambda_;
};
template <typename T>
struct uniform_transform {
explicit uniform_transform(T min, T max) : range_(max - min), min_(min) {}
HOSTDEVICE inline T operator()(T val) const {
if (UNLIKELY(val == static_cast<T>(1.0))) {
return min_;
} else {
return val * range_ + min_;
}
}
private:
T range_;
T min_;
};
template <typename T>
struct normal_transform {
explicit normal_transform(T mean, T std) : mean_(mean), std_(std) {}
HOSTDEVICE inline T operator()(T val) const { return val * std_ + mean_; }
private:
T mean_;
T std_;
};
#if defined(__NVCC__) || defined(__HIPCC__)
namespace kps = phi::kps;
/*********************** Distribution Function *************************/
template <typename T>
struct uniform_distribution;
template <typename T>
struct normal_distribution;
#if defined(__NVCC__)
template <>
struct uniform_distribution<float> {
__device__ inline float4 operator()(curandStatePhilox4_32_10_t *state) const {
return curand_uniform4(state);
}
static constexpr int kReturnsCount = 4;
};
template <>
struct uniform_distribution<double> {
__device__ inline double2 operator()(
curandStatePhilox4_32_10_t *state) const {
return curand_uniform2_double(state);
}
static constexpr int kReturnsCount = 2;
};
template <>
struct normal_distribution<float> {
__device__ inline float4 operator()(curandStatePhilox4_32_10_t *state) const {
return curand_normal4(state);
}
static constexpr int kReturnsCount = 4;
};
template <>
struct normal_distribution<double> {
__device__ inline double2 operator()(
curandStatePhilox4_32_10_t *state) const {
return curand_normal2_double(state);
}
static constexpr int kReturnsCount = 2;
};
#else
template <>
struct uniform_distribution<float> {
__device__ inline float4 operator()(
hiprandStatePhilox4_32_10_t *state) const {
return hiprand_uniform4(state);
}
static constexpr int kReturnsCount = 4;
};
template <>
struct uniform_distribution<double> {
__device__ inline double2 operator()(
hiprandStatePhilox4_32_10_t *state) const {
return hiprand_uniform2_double(state);
}
static constexpr int kReturnsCount = 2;
};
template <>
struct normal_distribution<float> {
__device__ inline float4 operator()(
hiprandStatePhilox4_32_10_t *state) const {
return hiprand_normal4(state);
}
static constexpr int kReturnsCount = 4;
};
template <>
struct normal_distribution<double> {
__device__ inline double2 operator()(
hiprandStatePhilox4_32_10_t *state) const {
return hiprand_normal2_double(state);
}
static constexpr int kReturnsCount = 2;
};
#endif
/******** Launch GPU function of distribution and transformation *********/
template <typename T, typename DistOp, typename TransformOp>
__global__ void DistributionKernel(size_t size,
uint64_t seed,
uint64_t offset,
DistOp dist,
TransformOp trans,
T *out_data,
size_t stride) {
size_t idx = static_cast<size_t>(BLOCK_ID_X * BLOCK_NUM_X);
static constexpr int kCount = DistOp::kReturnsCount;
#if defined(__NVCC__)
curandStatePhilox4_32_10_t state;
curand_init(seed, idx + THREAD_ID_X, offset, &state);
using SType = curandStatePhilox4_32_10_t;
#else
hiprandStatePhilox4_32_10_t state;
hiprand_init(seed, idx + THREAD_ID_X, offset, &state);
using SType = hiprandStatePhilox4_32_10_t;
#endif
size_t total_thread = GRID_NUM_X * BLOCK_NUM_X;
T args[kCount];
T result[kCount];
for (size_t i = idx; i < size; i += total_thread * kCount) {
kps::ElementwiseRandom<SType, T, kCount, 1, DistOp>(&args[0], dist, &state);
kps::ElementwiseUnary<T, T, kCount, 1, 1, TransformOp>(
&result[0], &args[0], trans);
kps::WriteData<T, T, kCount, 1, 1, true>(
out_data + i, &result[0], size - i, 1, stride, 1);
__syncthreads();
}
}
template <typename T, typename DistOp, typename TransformOp>
void distribution_and_transform(const GPUContext &dev_ctx,
DenseTensor *out,
DistOp dist,
TransformOp trans) {
T *out_data = dev_ctx.template Alloc<T>(out);
auto size = out->numel();
int64_t device_id = dev_ctx.GetPlace().GetDeviceId();
auto gen_cuda = dev_ctx.GetGenerator();
size_t block_size = 256;
size_t expect_grid_size = (size + block_size - 1) / block_size;
const auto &prop = backends::gpu::GetDeviceProperties(device_id);
size_t max_grid_size = (prop.maxThreadsPerMultiProcessor / block_size) *
prop.multiProcessorCount;
size_t grid_size =
expect_grid_size > max_grid_size ? max_grid_size : expect_grid_size;
size_t total_thread = block_size * grid_size;
size_t curand4_loop_times =
(size + 4 * total_thread - 1) / (4 * total_thread);
// 'increment' should be a multiple of 4
uint64_t increment = curand4_loop_times * 4;
auto seed_offset = gen_cuda->IncrementOffset(increment);
uint64_t seed = seed_offset.first;
uint64_t offset = seed_offset.second;
DistributionKernel<
T,
DistOp,
TransformOp><<<grid_size, block_size, 0, dev_ctx.stream()>>>(
size, seed, offset, dist, trans, out_data, total_thread);
}
#endif
} // namespace distribution
} // namespace phi
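To make uniform_transform above concrete: the curand/hiprand uniform generators return values in (0, 1], so the closed endpoint 1.0 is folded back to min and everything else is scaled into [min, max). A host-only mirror of the same mapping, for illustration:
#include <cassert>
template <typename T>
struct UniformTransformRef {  // mirrors uniform_transform<T>
  T range, min;
  UniformTransformRef(T lo, T hi) : range(hi - lo), min(lo) {}
  T operator()(T val) const {
    return (val == static_cast<T>(1.0)) ? min : val * range + min;
  }
};
int main() {
  UniformTransformRef<float> t(-2.0f, 2.0f);
  assert(t(1.0f) == -2.0f);  // the closed endpoint folds back to min
  assert(t(0.5f) == 0.0f);   // midpoint of (0, 1] maps to the range midpoint
  return 0;
}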
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/random.h>
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/hostdevice.h"
#include "paddle/phi/kernels/funcs/aligned_vector.h"
#include "paddle/phi/kernels/primitive/kernel_primitives.h"
namespace phi {
template <typename T, typename Functor, int VecSize>
__global__ void VectorizedIndexKernel(T *out,
size_t numel,
size_t main_offset,
Functor func) {
size_t data_offset = BLOCK_ID_X * BLOCK_NUM_X * VecSize;
size_t stride = BLOCK_NUM_X * GRID_NUM_X * VecSize;
size_t args[VecSize];
T result[VecSize];
for (; data_offset < main_offset; data_offset += stride) {
kps::InitWithDataIndex<size_t, VecSize, 1, 1>(&args[0], data_offset);
kps::ElementwiseUnary<size_t, T, VecSize, 1, 1, Functor>(
&result[0], &args[0], func);
kps::WriteData<T, VecSize, 1, 1, false>(
out + data_offset, &result[0], BLOCK_NUM_X * VecSize);
}
size_t num = numel - data_offset;
if (num > 0) {
kps::InitWithDataIndex<size_t, VecSize, 1, 1>(&args[0], data_offset);
kps::ElementwiseUnary<size_t, T, VecSize, 1, 1, Functor>(
&result[0], &args[0], func);
kps::WriteData<T, VecSize, 1, 1, true>(out + data_offset, &result[0], num);
}
}
template <typename T, typename Functor>
void IndexKernel(const KPDevice &dev_ctx, DenseTensor *out, Functor func) {
int numel = out->numel();
T *out_data = dev_ctx.template Alloc<T>(out);
if (numel <= 0) return;
int vec_size = phi::GetVectorizedSize(out_data);
#ifdef PADDLE_WITH_XPU_KP
int block = 64;
int grid = 8;
auto stream = dev_ctx.x_context()->xpu_stream;
#else
auto config =
phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel, vec_size);
int grid = config.block_per_grid.x;
int block = config.thread_per_block.x;
auto stream = dev_ctx.stream();
#endif
size_t main_offset = (numel / (vec_size * block)) * vec_size * block;
switch (vec_size) {
case 4:
VectorizedIndexKernel<T, Functor, 4><<<grid, block, 0, stream>>>(
out_data, numel, main_offset, func);
break;
case 2:
VectorizedIndexKernel<T, Functor, 2><<<grid, block, 0, stream>>>(
out_data, numel, main_offset, func);
break;
case 1:
VectorizedIndexKernel<T, Functor, 1><<<grid, block, 0, stream>>>(
out_data, numel, main_offset, func);
break;
default: {
PADDLE_THROW(phi::errors::Unimplemented(
"Unsupported vectorized size: %d !", vec_size));
break;
}
}
}
} // namespace phi
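The Functor consumed by IndexKernel above is any callable that maps the flat element index to a value of T on device. The functor below is a hypothetical illustration of that contract, assuming the HOSTDEVICE macro from paddle/phi/core/hostdevice.h; the ArangeFunctor name and the commented launch are not part of this patch:
// Hypothetical example only: fills out[i] = start + i * step.
template <typename T>
struct ArangeFunctor {
  T start, step;
  HOSTDEVICE ArangeFunctor(T s, T d) : start(s), step(d) {}
  HOSTDEVICE T operator()(size_t idx) const {
    return start + static_cast<T>(idx) * step;
  }
};
// Sketch of a launch from inside a phi kernel (dev_ctx and out as usual):
//   IndexKernel<T, ArangeFunctor<T>>(dev_ctx, out, ArangeFunctor<T>(0, 1));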
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
namespace phi {
namespace funcs {
#define LOGICAL_BINARY_FUNCTOR(func_name, op) \
template <typename T> \
struct func_name { \
using ELEMENT_TYPE = T; \
HOSTDEVICE bool operator()(const T a, const T b) const { \
return static_cast<bool>(a) op static_cast<bool>(b); \
} \
};
LOGICAL_BINARY_FUNCTOR(LogicalOrFunctor, ||)
LOGICAL_BINARY_FUNCTOR(LogicalAndFunctor, &&)
LOGICAL_BINARY_FUNCTOR(LogicalXorFunctor, ^)
#undef LOGICAL_BINARY_FUNCTOR
template <typename T>
struct LogicalNotFunctor {
using ELEMENT_TYPE = T;
HOSTDEVICE bool operator()(const T a) const { return !a; }
};
} // namespace funcs
} // namespace phi
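A small host-side check of the functor semantics defined above: operands are first collapsed to bool, so any nonzero value counts as true. This is a standalone restatement for illustration, not the phi header itself:
#include <cassert>
template <typename T>
struct XorRef {  // mirrors LogicalXorFunctor: bool(a) ^ bool(b)
  bool operator()(T a, T b) const {
    return static_cast<bool>(a) ^ static_cast<bool>(b);
  }
};
int main() {
  XorRef<int> f;
  assert(f(3, 0));   // nonzero ^ zero    -> true
  assert(!f(3, 7));  // nonzero ^ nonzero -> false
  return 0;
}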
......@@ -12,11 +12,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/impl/atan2_grad_kernel_impl.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/device_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/atan2_grad_kernel.h"
#include "paddle/phi/kernels/impl/atan2_grad_kernel_impl.h"
PD_REGISTER_KERNEL(atan2_grad,
GPU,
......
......@@ -12,11 +12,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/impl/atan2_kernel_impl.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/device_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/atan2_kernel.h"
#include "paddle/phi/kernels/impl/atan2_kernel_impl.h"
PD_REGISTER_KERNEL(atan2,
GPU,
......
......@@ -80,8 +80,4 @@ void CastKernel(const Context& dev_ctx,
paddle::experimental::DataType::UNDEFINED); \
}
#if !defined(PADDLE_WITH_HIP)
PTEN_REGISTER_CAST_CUDA_BASE_TYPE(cast, phi::dtype::bfloat16)
#else
PTEN_REGISTER_CAST_CUDA_BASE_TYPE(cast)
#endif
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/index_sample_grad_kernel.h"
#include <algorithm>
#include <vector>
#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace phi {
namespace {
template <typename Context>
void LimitGridDim(const Context& ctx, dim3* grid_dim) {
auto max_grid_dim =
reinterpret_cast<const phi::GPUContext&>(ctx).GetCUDAMaxGridDimSize();
grid_dim->x = grid_dim->x < max_grid_dim[0] ? grid_dim->x : max_grid_dim[0];
grid_dim->y = grid_dim->y < max_grid_dim[1] ? grid_dim->y : max_grid_dim[1];
}
#define PREDEFINED_BLOCK_SIZE_X 512
#define PREDEFINED_BLOCK_SIZE 1024
#define MIN(a, b) ((a) < (b) ? (a) : (b))
};
template <typename T, typename IndexT = int>
__global__ void IndexSampleGrad(const IndexT* index,
T* in_grad,
const T* out_grad,
size_t index_length,
size_t input_length,
size_t batch_size,
bool same_data_in_row = true) {
unsigned int index_i = blockDim.x * blockIdx.x + threadIdx.x;
unsigned int index_j = blockDim.y * blockIdx.y + threadIdx.y;
for (; index_j < batch_size; index_j += blockDim.y * gridDim.y) {
index_i = blockDim.x * blockIdx.x + threadIdx.x;
for (; index_i < index_length; index_i += blockDim.x * gridDim.x) {
unsigned int index_idx = index_j * index_length + index_i;
unsigned int in_idx = index_j * input_length + index_i;
IndexT sample_idx = index[index_idx];
if (same_data_in_row) {
paddle::platform::CudaAtomicAdd(
&(in_grad[in_idx - index_i + sample_idx]), out_grad[sample_idx]);
} else {
in_grad[in_idx - index_i + sample_idx] = out_grad[index_idx];
}
}
}
}
template <typename T, typename Context>
void IndexSampleGradKernel(const Context& ctx,
const DenseTensor& out_grad,
const DenseTensor& x,
const DenseTensor& index,
DenseTensor* x_grad) {
const T* output_grad_data = out_grad.data<T>();
T* input_grad_data = ctx.template Alloc<T>(x_grad);
auto index_type = index.dtype();
bool index_type_match =
index_type == DataType::INT32 || index_type == DataType::INT64;
PADDLE_ENFORCE_EQ(
index_type_match,
true,
errors::InvalidArgument(
"Input(Index) holds the wrong type, it holds %s, but "
"desires to be %s or %s",
paddle::framework::DataTypeToString(
paddle::framework::TransToProtoVarType(index_type)),
paddle::framework::DataTypeToString(
paddle::framework::TransToProtoVarType(DataType::INT32)),
paddle::framework::DataTypeToString(
paddle::framework::TransToProtoVarType((DataType::INT64)))));
auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
auto input_num = x.numel();
auto input_dim = x.dims();
auto index_dim = index.dims();
size_t batch_size = index_dim[0];
size_t input_length = input_dim[1];
size_t index_length = index_dim[1];
bool same_data_in_index_row = index_length != 1;
auto block_width = paddle::platform::RoundToPowerOfTwo(index_length);
block_width = MIN(block_width, PREDEFINED_BLOCK_SIZE_X);
auto block_height =
paddle::platform::RoundToPowerOfTwo(index_length * batch_size) /
block_width;
block_height = MIN(block_height, PREDEFINED_BLOCK_SIZE / block_width);
dim3 block_dim(block_width, block_height);
dim3 grid_dim((index_length + block_dim.x - 1) / block_dim.x,
(batch_size + block_dim.y - 1) / block_dim.y);
LimitGridDim(ctx, &grid_dim);
phi::funcs::SetConstant<Context, T> set_zero;
set_zero(ctx, x_grad, static_cast<T>(0));
if (index_type == DataType::INT64) {
const int64_t* index_data = index.data<int64_t>();
IndexSampleGrad<T, int64_t><<<grid_dim, block_dim, 0, stream>>>(
index_data,
input_grad_data,
output_grad_data,
index_length,
input_length,
batch_size,
same_data_in_index_row);
} else if (index_type == DataType::INT32) {
const int* index_data = index.data<int>();
IndexSampleGrad<T, int><<<grid_dim, block_dim, 0, stream>>>(
index_data,
input_grad_data,
output_grad_data,
index_length,
input_length,
batch_size,
same_data_in_index_row);
}
}
} // namespace phi
PD_REGISTER_KERNEL(index_sample_grad,
GPU,
ALL_LAYOUT,
phi::IndexSampleGradKernel,
float,
double,
int,
int64_t) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/index_sample_kernel.h"
#include <algorithm>
#include <vector>
#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace phi {
namespace {
template <typename Context>
void LimitGridDim(const Context& ctx, dim3* grid_dim) {
auto max_grid_dim =
reinterpret_cast<const phi::GPUContext&>(ctx).GetCUDAMaxGridDimSize();
grid_dim->x = grid_dim->x < max_grid_dim[0] ? grid_dim->x : max_grid_dim[0];
grid_dim->y = grid_dim->y < max_grid_dim[1] ? grid_dim->y : max_grid_dim[1];
}
#define PREDEFINED_BLOCK_SIZE_X 512
#define PREDEFINED_BLOCK_SIZE 1024
#define MIN(a, b) ((a) < (b) ? (a) : (b))
}
template <typename T, typename IndexT = int>
__global__ void IndexSampleForward(const IndexT* index,
const T* in_data,
T* out_data,
size_t index_length,
size_t input_length,
size_t batch_size) {
unsigned int index_i = blockDim.x * blockIdx.x + threadIdx.x;
unsigned int index_j = blockDim.y * blockIdx.y + threadIdx.y;
for (; index_j < batch_size; index_j += blockDim.y * gridDim.y) {
index_i = blockDim.x * blockIdx.x + threadIdx.x;
for (; index_i < index_length; index_i += blockDim.x * gridDim.x) {
unsigned int index_idx = index_j * index_length + index_i;
unsigned int in_idx = index_j * input_length + index_i;
IndexT sample_idx = index[index_idx];
out_data[index_idx] = in_data[in_idx - index_i + sample_idx];
}
}
}
template <typename T, typename Context>
void IndexSampleKernel(const Context& ctx,
const DenseTensor& x,
const DenseTensor& index,
DenseTensor* out) {
auto index_type = index.dtype();
bool index_type_match =
index_type == DataType::INT32 || index_type == DataType::INT64;
PADDLE_ENFORCE_EQ(
index_type_match,
true,
errors::InvalidArgument(
"Input(Index) holds the wrong type, it holds %s, but "
"desires to be %s or %s",
paddle::framework::DataTypeToString(
paddle::framework::TransToProtoVarType(index_type)),
paddle::framework::DataTypeToString(
paddle::framework::TransToProtoVarType(DataType::INT32)),
paddle::framework::DataTypeToString(
paddle::framework::TransToProtoVarType((DataType::INT64)))));
const T* in_data = x.data<T>();
T* out_data = ctx.template Alloc<T>(out);
auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
auto input_dim = x.dims();
auto index_dim = index.dims();
size_t batch_size = input_dim[0];
size_t input_length = input_dim[1];
size_t index_length = index_dim[1];
auto block_width = paddle::platform::RoundToPowerOfTwo(index_length);
block_width = MIN(block_width, PREDEFINED_BLOCK_SIZE_X);
int block_height =
paddle::platform::RoundToPowerOfTwo(index_length * batch_size) /
block_width;
block_height = MIN(block_height, PREDEFINED_BLOCK_SIZE / block_width);
dim3 block_dim(block_width, block_height);
dim3 grid_dim((index_length + block_dim.x - 1) / block_dim.x,
(batch_size + block_dim.y - 1) / block_dim.y);
LimitGridDim(ctx, &grid_dim);
if (index_type == DataType::INT64) {
const int64_t* index_data = index.data<int64_t>();
IndexSampleForward<T, int64_t><<<grid_dim, block_dim, 0, stream>>>(
index_data, in_data, out_data, index_length, input_length, batch_size);
} else if (index_type == DataType::INT32) {
const int* index_data = index.data<int>();
IndexSampleForward<T, int><<<grid_dim, block_dim, 0, stream>>>(
index_data, in_data, out_data, index_length, input_length, batch_size);
}
}
} // namespace phi
PD_REGISTER_KERNEL(index_sample,
GPU,
ALL_LAYOUT,
phi::IndexSampleKernel,
float,
double,
int,
int64_t) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/logical_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/elementwise_base.h"
#include "paddle/phi/kernels/funcs/logical_functor.h"
#include "paddle/phi/kernels/gpu/elementwise.h"
namespace phi {
#define DEFINE_LOGICAL_BINARY_KERNEL(type) \
template <typename T, typename Context> \
void Logical##type##Kernel(const Context& dev_ctx, \
const DenseTensor& x, \
const DenseTensor& y, \
DenseTensor* out) { \
using InT = typename funcs::Logical##type##Functor<T>::ELEMENT_TYPE; \
using OutT = bool; \
dev_ctx.template Alloc<bool>(out); \
funcs::Logical##type##Functor<T> binary_func; \
std::vector<const DenseTensor*> ins = {&x, &y}; \
std::vector<DenseTensor*> outs = {out}; \
funcs::BroadcastKernel<ElementwiseType::kBinary, InT, OutT>( \
dev_ctx, ins, &outs, -1, binary_func); \
}
DEFINE_LOGICAL_BINARY_KERNEL(And)
DEFINE_LOGICAL_BINARY_KERNEL(Or)
DEFINE_LOGICAL_BINARY_KERNEL(Xor)
#undef DEFINE_LOGICAL_BINARY_KERNEL
template <typename T, typename Context>
void LogicalNotKernel(const Context& dev_ctx,
const DenseTensor& x,
DenseTensor* out) {
using InT = typename funcs::LogicalNotFunctor<T>::ELEMENT_TYPE;
using OutT = bool;
dev_ctx.template Alloc<bool>(out);
funcs::LogicalNotFunctor<T> unary_func;
std::vector<const DenseTensor*> ins = {&x};
std::vector<DenseTensor*> outs = {out};
funcs::BroadcastKernel<ElementwiseType::kUnary, InT, OutT>(
dev_ctx, ins, &outs, -1, unary_func);
}
} // namespace phi
#define REGISTER_LOGICAL_CUDA_KERNEL(logical_and, func_type) \
PD_REGISTER_KERNEL(logical_and, \
GPU, \
ALL_LAYOUT, \
phi::Logical##func_type##Kernel, \
float, \
double, \
bool, \
int64_t, \
int, \
int8_t, \
int16_t) {}
REGISTER_LOGICAL_CUDA_KERNEL(logical_and, And)
REGISTER_LOGICAL_CUDA_KERNEL(logical_or, Or)
REGISTER_LOGICAL_CUDA_KERNEL(logical_not, Not)
REGISTER_LOGICAL_CUDA_KERNEL(logical_xor, Xor)
......@@ -155,6 +155,7 @@ PD_REGISTER_KERNEL(sum_raw,
float,
double,
float16,
bfloat16,
int16_t,
int,
int64_t,
......
......@@ -70,6 +70,7 @@ PD_REGISTER_KERNEL(scale,
float,
double,
phi::dtype::float16,
phi::dtype::bfloat16,
uint8_t,
int8_t,
int16_t,
......
......@@ -27,20 +27,6 @@ void SplitKernel(const Context& dev_ctx,
const ScalarArray& num_or_sections,
const Scalar& axis_scalar,
std::vector<DenseTensor*> outs) {
// need to infershape output
if (num_or_sections.FromTensor() || axis_scalar.FromTensor()) {
std::vector<MetaTensor> out_metas;
for (size_t i = 0; i < outs.size(); ++i) {
out_metas.push_back(outs[i]);
}
phi::SplitInferMeta(x, num_or_sections, axis_scalar, &out_metas, true);
for (size_t i = 0; i < out_metas.size(); ++i) {
outs[i]->Resize(out_metas[i].dims());
}
}
std::vector<const DenseTensor*> shape_refer;
for (size_t j = 0; j < outs.size(); ++j) {
dev_ctx.template Alloc<T>(outs[j]);
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/uniform_random_kernel.h"
#include "gflags/gflags.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/distribution_helper.h"
#include "paddle/phi/kernels/funcs/index_impl.cu.h"
DECLARE_bool(use_curand);
namespace phi {
template <typename T>
struct UniformGenerator {
T min_, max_;
unsigned int seed_;
T diag_val_;
unsigned int diag_num_;
unsigned int diag_step_;
__host__ __device__ UniformGenerator(
T min, T max, int seed, int diag_num, int diag_step, T diag_val)
: min_(min),
max_(max),
seed_(seed),
diag_num_(diag_num),
diag_step_(diag_step),
diag_val_(diag_val) {}
__host__ __device__ T operator()(const unsigned int n) const {
thrust::minstd_rand rng;
rng.seed(seed_);
thrust::uniform_real_distribution<T> dist(min_, max_);
rng.discard(n);
T out = dist(rng);
unsigned int remainder = n % (diag_step_ + 1);
if (remainder == 0 && diag_num_ > n / (diag_step_ + 1)) {
out = diag_val_;
}
return out;
}
};
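// Same as UniformGenerator, but additionally skips `offset_` states so that
// consecutive launches sharing one generator do not reuse random numbers.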
template <typename T>
struct UniformGeneratorOffset {
T min_, max_;
unsigned int seed_;
T diag_val_;
unsigned int diag_num_;
unsigned int diag_step_;
int offset_;
__host__ __device__ UniformGeneratorOffset(T min,
T max,
int seed,
int diag_num,
int diag_step,
T diag_val,
int offset)
: min_(min),
max_(max),
seed_(seed),
diag_num_(diag_num),
diag_step_(diag_step),
diag_val_(diag_val),
offset_(offset) {}
__host__ __device__ T operator()(const unsigned int n) const {
thrust::minstd_rand rng;
rng.seed(seed_);
thrust::uniform_real_distribution<T> dist(min_, max_);
rng.discard(n + offset_);
T out = dist(rng);
unsigned int remainder = n % (diag_step_ + 1);
if (remainder == 0 && diag_num_ > n / (diag_step_ + 1)) {
out = diag_val_;
}
return out;
}
};
template <typename T, typename Context>
void UniformRandomRawKernel(const Context& dev_ctx,
const ScalarArray& shape,
DataType dtype,
float min,
float max,
int seed,
int diag_num,
int diag_step,
float diag_val,
DenseTensor* out) {
out->Resize(phi::make_ddim(shape.GetData()));
T* data = dev_ctx.template Alloc<T>(out);
auto size = out->numel();
bool seed_flag = false;
if (seed == 0) {
std::random_device rd;
seed = rd();
seed_flag = true;
}
auto generator = dev_ctx.GetGenerator();
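  // When the global generator was seeded from Python and no explicit seed was
  // given, either take the cuRAND-based distribution path (FLAGS_use_curand)
  // or advance the generator offset so repeated launches stay independent.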
if (generator->GetIsInitPy() && seed_flag) {
if (FLAGS_use_curand) {
using MT = typename kps::details::MPTypeTrait<T>::Type;
distribution::uniform_distribution<MT> dist;
distribution::uniform_transform<MT> trans(min, max);
distribution::distribution_and_transform<T>(dev_ctx, out, dist, trans);
} else {
auto seed_offset = generator->IncrementOffset(1);
int64_t gen_offset = size * seed_offset.second;
auto func = UniformGeneratorOffset<T>(min,
max,
seed_offset.first,
diag_num,
diag_step,
diag_val,
gen_offset);
IndexKernel<T, UniformGeneratorOffset<T>>(dev_ctx, out, func);
}
} else {
auto func =
UniformGenerator<T>(min, max, seed, diag_num, diag_step, diag_val);
IndexKernel<T, UniformGenerator<T>>(dev_ctx, out, func);
}
}
template <typename T, typename Context>
void UniformRandomKernel(const Context& dev_ctx,
const ScalarArray& shape,
DataType dtype,
float min,
float max,
int seed,
DenseTensor* out) {
UniformRandomRawKernel<T>(
dev_ctx, shape, dtype, min, max, seed, 0, 0, 0.0f, out);
}
} // namespace phi
PD_REGISTER_KERNEL(uniform_random_raw,
GPU,
ALL_LAYOUT,
phi::UniformRandomRawKernel,
float,
double) {}
PD_REGISTER_KERNEL(
uniform_random, GPU, ALL_LAYOUT, phi::UniformRandomKernel, float, double) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/where_grad_kernel.h"
namespace phi {
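// Grid-stride loop that routes dout[idx] to dx where cond is true and to dy
// where it is false; a null dx or dy means that gradient is not requested.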
template <typename T>
__global__ void WhereGradCUDAKernel(
const int N, const T* dout, const bool* cond, T* dx, T* dy) {
int idx = blockDim.x * blockIdx.x + threadIdx.x;
for (; idx < N; idx += blockDim.x * gridDim.x) {
if (dx != nullptr) {
dx[idx] = cond[idx] ? dout[idx] : 0.;
}
if (dy != nullptr) {
dy[idx] = cond[idx] ? 0. : dout[idx];
}
}
}
template <typename T, typename Context>
void WhereGradKernel(const Context& ctx,
const DenseTensor& condition,
const DenseTensor& x,
const DenseTensor& y,
const DenseTensor& out_grad,
DenseTensor* x_grad,
DenseTensor* y_grad) {
const bool* cond_data = condition.data<bool>();
auto numel = condition.numel();
auto* dout = out_grad.data<T>();
T* dx = (x_grad != nullptr) ? ctx.template Alloc<T>(x_grad) : nullptr;
T* dy = (y_grad != nullptr) ? ctx.template Alloc<T>(y_grad) : nullptr;
auto stream = ctx.stream();
auto config = backends::gpu::GetGpuLaunchConfig1D(ctx, numel);
WhereGradCUDAKernel<
T><<<config.block_per_grid.x, config.thread_per_block.x, 0, stream>>>(
numel, dout, cond_data, dx, dy);
}
} // namespace phi
PD_REGISTER_KERNEL(where_grad,
GPU,
ALL_LAYOUT,
phi::WhereGradKernel,
float,
double,
int,
int64_t) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/where_kernel.h"
#include "paddle/phi/kernels/funcs/broadcast_function.h"
#include "paddle/phi/kernels/funcs/elementwise_functor.h"
namespace phi {
// Ternary select functor: out = cond ? x : y, applied element-wise with broadcasting.
template <typename T>
struct CondFunctor {
inline HOSTDEVICE T operator()(const bool cond, const T x, const T y) const {
return cond ? x : y;
}
};
template <typename T, typename Context>
void WhereKernel(const Context& ctx,
const DenseTensor& condition,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* out) {
std::vector<const DenseTensor*> ins = {&condition, &x, &y};
std::vector<DenseTensor*> outs = {out};
ctx.template Alloc<T>(out);
CondFunctor<T> func;
funcs::BroadcastKernel<ElementwiseType::kTernary, T, T>(
ctx, ins, &outs, -1, func);
}
} // namespace phi
PD_REGISTER_KERNEL(
where, GPU, ALL_LAYOUT, phi::WhereKernel, float, double, int, int64_t) {}
......@@ -14,9 +14,10 @@
#pragma once
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/atan2_grad_kernel.h"
#include "paddle/phi/kernels/funcs/for_range.h"
#include "paddle/fluid/platform/for_range.h"
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
......
......@@ -14,9 +14,10 @@
#pragma once
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/atan2_kernel.h"
#include "paddle/phi/kernels/funcs/for_range.h"
#include "paddle/fluid/platform/for_range.h"
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T>
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void IndexSampleGradKernel(const Context& ctx,
const DenseTensor& out_grad,
const DenseTensor& x,
const DenseTensor& index,
DenseTensor* in_grad);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void IndexSampleKernel(const Context& ctx,
const DenseTensor& x,
const DenseTensor& index,
DenseTensor* out);
} // namespace phi
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
#define DECLARE_LOGICAL_BINARY_KERNEL(type)          \
  template <typename T, typename Context>            \
  void Logical##type##Kernel(const Context& dev_ctx, \
                             const DenseTensor& x,   \
                             const DenseTensor& y,   \
                             DenseTensor* out);
DECLARE_LOGICAL_BINARY_KERNEL(And)
DECLARE_LOGICAL_BINARY_KERNEL(Or)
DECLARE_LOGICAL_BINARY_KERNEL(Xor)
#undef DECLARE_LOGICAL_BINARY_KERNEL
template <typename T, typename Context>
void LogicalNotKernel(const Context& dev_ctx,
const DenseTensor& x,
DenseTensor* out);
} // namespace phi
......@@ -165,6 +165,7 @@ PD_REGISTER_KERNEL(sum,
float,
double,
phi::dtype::float16,
phi::dtype::bfloat16,
int16_t,
int,
int64_t,
......
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/kernels/uniform_random_kernel.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
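// The SelectedRows variants simply forward to the DenseTensor kernels,
// writing into out->mutable_value().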
template <typename T, typename Context>
void UniformRandomRawSRKernel(const Context& dev_ctx,
const ScalarArray& shape,
DataType dtype,
float min,
float max,
int seed,
int diag_num,
int diag_step,
float diag_val,
SelectedRows* out) {
phi::UniformRandomRawKernel<T>(dev_ctx,
shape,
dtype,
min,
max,
seed,
diag_num,
diag_step,
diag_val,
out->mutable_value());
}
template <typename T, typename Context>
void UniformRandomSRKernel(const Context& dev_ctx,
const ScalarArray& shape,
DataType dtype,
float min,
float max,
int seed,
SelectedRows* out) {
phi::UniformRandomKernel<T>(
dev_ctx, shape, dtype, min, max, seed, out->mutable_value());
}
} // namespace phi
PD_REGISTER_KERNEL(uniform_random_raw_sr,
CPU,
ALL_LAYOUT,
phi::UniformRandomRawSRKernel,
float,
double,
phi::dtype::bfloat16) {}
PD_REGISTER_KERNEL(uniform_random_sr,
CPU,
ALL_LAYOUT,
phi::UniformRandomSRKernel,
float,
double,
phi::dtype::bfloat16) {}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PD_REGISTER_KERNEL(uniform_random_raw_sr,
GPU,
ALL_LAYOUT,
phi::UniformRandomRawSRKernel,
float,
double) {}
PD_REGISTER_KERNEL(uniform_random_sr,
GPU,
ALL_LAYOUT,
phi::UniformRandomSRKernel,
float,
double) {}
#endif
......@@ -43,18 +43,18 @@ std::vector<DenseTensor> Split(const Context& dev_ctx,
}
std::vector<MetaTensor> out_meta;
std::vector<MetaTensor*> out_meta_ptr;
out_meta.reserve(out_number);
out_meta_ptr.reserve(out_number);
std::vector<DenseTensor> result;
result.reserve(out_number);
for (size_t i = 0; i < out_number; ++i) {
auto dense_out = phi::Empty<T, Context>(dev_ctx);
MetaTensor tmp_meta(&dense_out);
result.push_back(dense_out);
out_meta.push_back(&result.back());
result.emplace_back(phi::Empty<T, Context>(dev_ctx));
out_meta.emplace_back(&result.back());
out_meta_ptr.push_back(&out_meta.back());
}
SplitInferMeta(x, num_or_sections, axis, &out_meta);
SplitInferMeta(x, num_or_sections, axis, out_meta_ptr);
std::vector<DenseTensor*> outs;
outs.reserve(out_meta.size());
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/common/scalar_array.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/device_context.h"
#include "paddle/phi/core/selected_rows.h"
namespace phi {
template <typename T, typename Context>
void UniformRandomRawKernel(const Context& dev_ctx,
const ScalarArray& shape,
DataType dtype,
float min,
float max,
int seed,
int diag_num,
int diag_step,
float diag_val,
DenseTensor* out);
template <typename T, typename Context>
void UniformRandomKernel(const Context& dev_ctx,
const ScalarArray& shape,
DataType dtype,
float min,
float max,
int seed,
DenseTensor* out);
template <typename T, typename Context>
void UniformRandomRawSRKernel(const Context& dev_ctx,
const ScalarArray& shape,
DataType dtype,
float min,
float max,
int seed,
int diag_num,
int diag_step,
float diag_val,
SelectedRows* out);
template <typename T, typename Context>
void UniformRandomSRKernel(const Context& dev_ctx,
const ScalarArray& shape,
DataType dtype,
float min,
float max,
int seed,
SelectedRows* out);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
template <typename T, typename Context>
void WhereGradKernel(const Context& ctx,
const DenseTensor& condition,
const DenseTensor& x,
const DenseTensor& y,
const DenseTensor& out_grad,
DenseTensor* x_grad,
DenseTensor* y_grad);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
template <typename T, typename Context>
void WhereKernel(const Context& ctx,
const DenseTensor& condition,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* out);
} // namespace phi
......@@ -59,7 +59,7 @@ void FullKernel(const Context& dev_ctx,
const Scalar& val,
DataType dtype,
DenseTensor* out) {
out->ResizeAndAllocate(phi::make_ddim(shape.GetData()));
out->Resize(phi::make_ddim(shape.GetData()));
FullValueXPU<T>(dev_ctx, out, val.to<T>());
}
......@@ -69,6 +69,7 @@ void FullLikeKernel(const Context& dev_ctx,
const Scalar& val,
DataType dtype,
DenseTensor* out) {
dev_ctx.template Alloc<T>(out);
auto value = val.to<float>();
using XPUInTDType = typename XPUTypeTrait<T>::Type;
using CommonType = typename std::common_type<
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
KernelSignature IndexSampleGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature("index_sample_grad",
{GradVarName("Out"), "X", "Index"},
{},
{GradVarName("X")});
}
} // namespace phi
PD_REGISTER_ARG_MAPPING_FN(index_sample_grad,
phi::IndexSampleGradOpArgumentMapping);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
KernelSignature UniformRandomOpArgumentMapping(
const ArgumentMappingContext& ctx) {
int diag_num = paddle::any_cast<int>(ctx.Attr("diag_num"));
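  // Dispatch on the output type (DenseTensor vs. SelectedRows), on whether a
  // diagonal is requested (diag_num != 0), and on where the shape comes from
  // (ShapeTensorList, then ShapeTensor, then the shape attribute).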
if (ctx.IsDenseTensorOutput("Out")) {
if (diag_num) {
if (ctx.InputSize("ShapeTensorList") > 0) {
return KernelSignature("uniform_random_raw",
{},
{"ShapeTensorList",
"dtype",
"min",
"max",
"seed",
"diag_num",
"diag_step",
"diag_val"},
{"Out"});
} else {
const auto& shape =
paddle::any_cast<std::vector<int64_t>>(ctx.Attr("shape"));
if (ctx.HasInput("ShapeTensor") && shape.empty()) {
return KernelSignature("uniform_random_raw",
{},
{"ShapeTensor",
"dtype",
"min",
"max",
"seed",
"diag_num",
"diag_step",
"diag_val"},
{"Out"});
} else {
return KernelSignature("uniform_random_raw",
{},
{"shape",
"dtype",
"min",
"max",
"seed",
"diag_num",
"diag_step",
"diag_val"},
{"Out"});
}
}
} else {
if (ctx.InputSize("ShapeTensorList") > 0) {
return KernelSignature(
"uniform_random",
{},
{"ShapeTensorList", "dtype", "min", "max", "seed"},
{"Out"});
} else {
const auto& shape =
paddle::any_cast<std::vector<int64_t>>(ctx.Attr("shape"));
if (ctx.HasInput("ShapeTensor") && shape.empty()) {
return KernelSignature("uniform_random",
{},
{"ShapeTensor", "dtype", "min", "max", "seed"},
{"Out"});
} else {
return KernelSignature("uniform_random",
{},
{"shape", "dtype", "min", "max", "seed"},
{"Out"});
}
}
}
} else if (ctx.IsSelectedRowsOutput("Out")) {
if (diag_num) {
if (ctx.InputSize("ShapeTensorList") > 0) {
return KernelSignature("uniform_random_raw_sr",
{},
{"ShapeTensorList",
"dtype",
"min",
"max",
"seed",
"diag_num",
"diag_step",
"diag_val"},
{"Out"});
} else {
const auto& shape =
paddle::any_cast<std::vector<int64_t>>(ctx.Attr("shape"));
if (ctx.HasInput("ShapeTensor") && shape.empty()) {
return KernelSignature("uniform_random_raw_sr",
{},
{"ShapeTensor",
"dtype",
"min",
"max",
"seed",
"diag_num",
"diag_step",
"diag_val"},
{"Out"});
} else {
return KernelSignature("uniform_random_raw_sr",
{},
{"shape",
"dtype",
"min",
"max",
"seed",
"diag_num",
"diag_step",
"diag_val"},
{"Out"});
}
}
} else {
if (ctx.InputSize("ShapeTensorList") > 0) {
return KernelSignature(
"uniform_random_sr",
{},
{"ShapeTensorList", "dtype", "min", "max", "seed"},
{"Out"});
} else {
const auto& shape =
paddle::any_cast<std::vector<int64_t>>(ctx.Attr("shape"));
if (ctx.HasInput("ShapeTensor") && shape.empty()) {
return KernelSignature("uniform_random_sr",
{},
{"ShapeTensor", "dtype", "min", "max", "seed"},
{"Out"});
} else {
return KernelSignature("uniform_random_sr",
{},
{"shape", "dtype", "min", "max", "seed"},
{"Out"});
}
}
}
}
return KernelSignature("unregistered", {}, {}, {});
}
} // namespace phi
PD_REGISTER_ARG_MAPPING_FN(uniform_random, phi::UniformRandomOpArgumentMapping);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
KernelSignature WhereGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
return KernelSignature("where_grad",
{"Condition", "X", "Y", GradVarName("Out")},
{},
{GradVarName("X"), GradVarName("Y")});
}
} // namespace phi
PD_REGISTER_ARG_MAPPING_FN(where_grad, phi::WhereGradOpArgumentMapping);
......@@ -44,6 +44,9 @@ TEST(Backend, OStream) {
oss << phi::Backend::GPUDNN;
EXPECT_EQ(oss.str(), "GPUDNN");
oss.str("");
oss << phi::Backend::KPS;
EXPECT_EQ(oss.str(), "KPS");
oss.str("");
try {
oss << phi::Backend::NUM_BACKENDS;
} catch (const std::exception& exception) {
......@@ -61,6 +64,7 @@ TEST(Backend, StringToBackend) {
EXPECT_EQ(phi::Backend::NPU, pexp::StringToBackend("NPU"));
EXPECT_EQ(phi::Backend::MKLDNN, pexp::StringToBackend("MKLDNN"));
EXPECT_EQ(phi::Backend::GPUDNN, pexp::StringToBackend("GPUDNN"));
EXPECT_EQ(phi::Backend::KPS, pexp::StringToBackend("KPS"));
EXPECT_EQ(static_cast<phi::Backend>(
static_cast<size_t>(phi::Backend::NUM_BACKENDS) + 1),
pexp::StringToBackend("CustomBackend"));
......
......@@ -146,12 +146,10 @@ TEST(CustomKernel, custom_kernel_dot) {
custom_fake_dot_kernels.end());
// 3.before register
auto& kernel_factory_instance = phi::KernelFactory::Instance();
auto& kernels = phi::KernelFactory::Instance().kernels();
EXPECT_TRUE(!kernel_factory_instance.HasCompatiblePhiKernel(op_name));
EXPECT_TRUE(kernels.find(op_name) == kernels.end());
// mock fake_dot is supported by phi for HasCompatiblePhiKernel check while
// registering
  // mock fake_dot as a kernel supported by phi for the check done while registering
auto& fake_dot_kernels = kernels[op_name];
EXPECT_TRUE(fake_dot_kernels.find(
......@@ -196,7 +194,7 @@ TEST(CustomKernel, custom_kernel_dot) {
fake_dot_kernels.end());
// 4.kernel select
auto kernel = kernel_factory_instance.SelectKernelOrThrowError(
auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError(
op_name, phi::KernelKey(backend, layout, phi::DataType::UINT8));
// 5.prepare parameters for kernel
......
......@@ -426,9 +426,6 @@ class Quant2Int8MkldnnPass(object):
graph = self._apply_pass(graph, 'depthwise_conv_mkldnn_pass')
graph = self._apply_pass(graph, 'conv_bn_fuse_pass')
graph = self._apply_pass(graph, 'conv_eltwiseadd_bn_fuse_pass')
graph = self._apply_pass(graph, 'conv_affine_channel_fuse_pass')
graph = self._apply_pass(graph,
'conv_eltwiseadd_affine_channel_fuse_pass')
graph = self._apply_pass(graph, 'conv_transpose_bn_fuse_pass')
graph = self._apply_pass(graph,
'conv_transpose_eltwiseadd_bn_fuse_pass')
......
......@@ -560,13 +560,19 @@ class DataParallel(layers.Layer):
strategy=None,
comm_buffer_size=25,
last_comm_buffer_size=1,
find_unused_parameters=False):
find_unused_parameters=False,
process_group=None,
gradient_as_buffer_view=False,
static_graph=False):
super(DataParallel,
self).__init__(layers.full_name() + "_data_parallel")
self._layers = layers
self.find_unused_parameters = find_unused_parameters
self.grad_need_sync = True
self.process_group = process_group
self.gradient_as_buffer_view = gradient_as_buffer_view
self.static_graph = static_graph
# NOTE(chenweihang): The ParallelStrategy here is not strictly a strategy.
# It just stores some environment variables, which can be constructed by
......
......@@ -590,7 +590,7 @@ foreach(TEST_OP ${TEST_OPS})
py_test_modules(${TEST_OP} MODULES ${TEST_OP})
endforeach(TEST_OP)
py_test_modules(test_adam_op_multi_thread MODULES test_adam_op ENVS FLAGS_inner_op_parallelism=4)
if (WITH_GPU OR WITH_XPU OR WITH_ASCEND OR WITH_ASCEND_CL)
if (WITH_GPU OR WITH_XPU OR WITH_ASCEND OR WITH_ASCEND_CL OR APPLE)
py_test_modules(test_warpctc_op MODULES test_warpctc_op)
set_tests_properties(test_warpctc_op PROPERTIES TIMEOUT 120)
endif()
......
......@@ -144,6 +144,11 @@ def run_model(use_distributed_lamb, use_fp16, use_master_param_norm, **kwargs):
grad_clip = kwargs.get('grad_clip', None)
clip_after_allreduce = kwargs.get('clip_after_allreduce', True)
parameters = [p.name for p in main.all_parameters()]
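    # Exclude every fourth parameter from weight decay to exercise
    # exclude_from_weight_decay_fn.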
exclude_fn = lambda var: var.name in parameters[::4]
kwargs['exclude_from_weight_decay_fn'] = exclude_fn
kwargs['lamb_weight_decay'] = 0.1
if use_distributed_lamb:
optimizer_class = DistributedFusedLamb
kwargs = dict(kwargs)
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from auto_scan_test import PassAutoScanTest, IgnoreReasons
from program_config import TensorConfig, ProgramConfig, OpConfig
import numpy as np
import paddle.inference as paddle_infer
from functools import partial
from typing import Optional, List, Callable, Dict, Any, Set
import unittest
import hypothesis
from hypothesis import given, settings, seed, example, assume, reproduce_failure
import hypothesis.strategies as st
class TestConvAffineChannelFusePass(PassAutoScanTest):
def is_program_valid(self, program_config: ProgramConfig) -> bool:
return True
def sample_program_config(self, draw):
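        # Draw a random conv2d + affine_channel program; channel counts are
        # kept multiples of 4 and in/out channels respect the group count.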
padding_algorithm = draw(st.sampled_from(["EXPLICIT", "SAME", "VALID"]))
groups = draw(st.integers(min_value=1, max_value=3))
data_format = draw(st.sampled_from(["NCHW", "NHWC"]))
axis = draw(st.sampled_from([1]))
filter_channel = draw(st.integers(min_value=1, max_value=16)) * 4
filter_size = draw(st.integers(min_value=1, max_value=4))
in_channel = groups * filter_channel
out_channel_factor = draw(st.integers(min_value=1, max_value=16)) * 4
out_channel = groups * out_channel_factor
batch_size = draw(st.integers(min_value=1, max_value=4))
dilations = draw(
st.lists(
st.integers(
min_value=1, max_value=2), min_size=2, max_size=2))
paddings = draw(
st.lists(
st.integers(
min_value=0, max_value=2), min_size=2, max_size=2))
strides = draw(
st.lists(
st.integers(
min_value=1, max_value=2), min_size=2, max_size=2))
has_bias = draw(st.booleans())
x_shape = [
batch_size, in_channel, 64, 64
] if data_format == "NCHW" else [batch_size, 64, 64, in_channel]
w_shape = [out_channel, filter_channel, filter_size, filter_size]
scale_shape = [out_channel]
bias_shape = [out_channel]
def generate_input():
return np.random.random(x_shape).astype(np.float32)
def generate_weight():
return np.random.random(w_shape).astype(np.float32)
def generate_bias():
return np.random.random(bias_shape).astype(np.float32)
def generate_scale_bias():
return np.random.random(bias_shape).astype(np.float32)
conv2d_op = OpConfig(
"conv2d",
inputs={
"Input": ["input_data"],
"Filter": ["conv2d_weight"],
},
outputs={"Output": ["conv_output"]},
data_format=data_format,
dilations=dilations,
padding_algorithm=padding_algorithm,
groups=groups,
paddings=paddings,
strides=strides,
has_bias=has_bias,
is_test=True)
ac_op = OpConfig(
"affine_channel",
inputs={
"X": ["conv_output"],
"Scale": ["affine_channel_scale"],
"Bias": ["affine_channel_bias"]
},
outputs={"Out": ["affine_channel_ouput"]},
data_layout=data_format)
if has_bias == True:
conv2d_op.inputs["Bias"] = ["conv2d_bias"]
ops = [conv2d_op, ac_op]
program_config = ProgramConfig(
ops=ops,
inputs={
"input_data": TensorConfig(data_gen=partial(generate_input)),
},
weights={
"conv2d_weight":
TensorConfig(data_gen=partial(generate_weight)),
"affine_channel_scale":
TensorConfig(data_gen=partial(generate_scale_bias)),
"affine_channel_bias":
TensorConfig(data_gen=partial(generate_scale_bias)),
},
outputs=["affine_channel_ouput"])
if has_bias == True:
program_config.weights["conv2d_bias"] = TensorConfig(
data_gen=partial(generate_bias))
return program_config
def sample_predictor_configs(self, program_config):
config = self.create_inference_config(use_gpu=True)
yield config, ['conv2d', 'elementwise_add'], (1e-4, 1e-4)
config = self.create_inference_config(use_mkldnn=True)
yield config, ['conv2d', 'elementwise_add'], (1e-4, 1e-4)
def add_ignore_pass_case(self):
        # Once the underlying problem is fixed, remove the corresponding
        # check in is_program_valid.
def teller1(program_config, predictor_config):
if program_config.ops[0].attrs['data_format'] == "NHWC":
return True
return False
        # The MKLDNN output differs from the reference when a bias is present.
def teller2(program_config, predictor_config):
return predictor_config.mkldnn_enabled() and program_config.ops[
0].attrs['has_bias'] == True
self.add_ignore_check_case(
teller1, IgnoreReasons.PASS_ACCURACY_ERROR,
"The output format of conv2d is wrong when data_format attribute is NHWC, \
because currently its fused op (Conv2DFusion) only supports data format of channel first (NCHW)."
)
self.add_ignore_check_case(
teller2, IgnoreReasons.PASS_ACCURACY_ERROR,
"Currently mkldnn Output has diff with bias!")
def test(self):
self.run_and_statis(
quant=False,
passes=["conv_affine_channel_fuse_pass"], )
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from auto_scan_test import PassAutoScanTest, IgnoreReasons
from program_config import TensorConfig, ProgramConfig, OpConfig
import numpy as np
import paddle.inference as paddle_infer
from functools import partial
from typing import Optional, List, Callable, Dict, Any, Set
import unittest
import hypothesis
from hypothesis import given, settings, seed, example, assume
import hypothesis.strategies as st
class TestConvEltwiseAddAffineChannelFusePass(PassAutoScanTest):
def is_program_valid(self, program_config: ProgramConfig) -> bool:
attrs = [
program_config.ops[i].attrs
for i in range(len(program_config.ops))
]
if attrs[0]['data_format'] == "NHWC" and attrs[1]['axis'] != 3:
return False
return True
def sample_program_config(self, draw):
padding_algorithm = draw(st.sampled_from(["EXPLICIT", "SAME", "VALID"]))
groups = draw(st.integers(min_value=1, max_value=3))
data_format = draw(st.sampled_from(["NCHW", "NHWC"]))
axis = draw(st.sampled_from([1]))
filter_channel = draw(st.integers(min_value=1, max_value=16)) * 4
filter_size = draw(st.integers(min_value=1, max_value=4))
in_channel = groups * filter_channel
out_channel_factor = draw(st.integers(min_value=1, max_value=16)) * 4
out_channel = groups * out_channel_factor
batch_size = draw(st.integers(min_value=1, max_value=4))
dilations = draw(
st.lists(
st.integers(
min_value=1, max_value=2), min_size=2, max_size=2))
paddings = draw(
st.lists(
st.integers(
min_value=0, max_value=2), min_size=2, max_size=2))
strides = draw(
st.lists(
st.integers(
min_value=1, max_value=2), min_size=2, max_size=2))
has_bias = draw(st.booleans())
x_shape = [
batch_size, in_channel, 64, 64
] if data_format == "NCHW" else [batch_size, 64, 64, in_channel]
w_shape = [out_channel, filter_channel, filter_size, filter_size]
scale_shape = [out_channel]
bias_shape = [out_channel]
def generate_input():
return np.random.random(x_shape).astype(np.float32)
def generate_weight():
return np.random.random(w_shape).astype(np.float32)
def generate_bias():
return np.random.random(bias_shape).astype(np.float32)
def generate_scale_bias():
return np.random.random(bias_shape).astype(np.float32)
conv2d_op = OpConfig(
"conv2d",
inputs={
"Input": ["input_data"],
"Filter": ["conv2d_weight"],
},
outputs={"Output": ["conv_output"]},
data_format=data_format,
dilations=dilations,
padding_algorithm=padding_algorithm,
groups=groups,
paddings=paddings,
strides=strides,
has_bias=has_bias,
is_test=True)
eltwise_op = OpConfig(
"elementwise_add",
inputs={"X": ["conv_output"],
"Y": ["conv2d_bias"]},
outputs={"Out": ["elementwise_output"]},
axis=axis)
ac_op = OpConfig(
"affine_channel",
inputs={
"X": ["elementwise_output"],
"Scale": ["affine_channel_scale"],
"Bias": ["affine_channel_bias"]
},
outputs={"Out": ["affine_channel_ouput"]},
data_layout=data_format)
if has_bias == True:
conv2d_op.inputs["Bias"] = ["conv2d_bias"]
ops = [conv2d_op, eltwise_op, ac_op]
program_config = ProgramConfig(
ops=ops,
inputs={
"input_data": TensorConfig(data_gen=partial(generate_input)),
},
weights={
"conv2d_weight":
TensorConfig(data_gen=partial(generate_weight)),
"conv2d_bias": TensorConfig(data_gen=partial(generate_bias)),
"affine_channel_scale":
TensorConfig(data_gen=partial(generate_scale_bias)),
"affine_channel_bias":
TensorConfig(data_gen=partial(generate_scale_bias)),
},
outputs=["affine_channel_ouput"])
return program_config
def sample_predictor_configs(self, program_config):
config = self.create_inference_config(use_gpu=True)
yield config, ['conv2d', 'elementwise_add'], (1e-4, 1e-4)
config = self.create_inference_config(use_mkldnn=True)
yield config, ['conv2d', 'elementwise_add'], (1e-4, 1e-4)
# TRT
config = self.create_trt_inference_config()
config.enable_tensorrt_engine(
workspace_size=1 << 20,
max_batch_size=4,
min_subgraph_size=1,
precision_mode=paddle_infer.PrecisionType.Float32,
use_static=False,
use_calib_mode=False)
yield config, ['conv2d', 'elementwise_add'], (1e-4, 1e-4)
def add_ignore_pass_case(self):
        # Once the underlying problem is fixed, remove the corresponding
        # check in is_program_valid.
def teller1(program_config, predictor_config):
if program_config.ops[0].attrs['data_format'] == "NHWC":
return True
return False
        # The MKLDNN output differs from the reference when a bias is present.
def teller2(program_config, predictor_config):
return predictor_config.mkldnn_enabled() and program_config.ops[
0].attrs['has_bias'] == True
self.add_ignore_check_case(
teller1, IgnoreReasons.PASS_ACCURACY_ERROR,
"The output format of conv2d is wrong when data_format attribute is NHWC, \
it will trigger Broadcast dimension mismatch bug \
when data_format attribute is NHWC and axis of eltwise op is 1 for this pass."
)
self.add_ignore_check_case(
teller2, IgnoreReasons.PASS_ACCURACY_ERROR,
"Currently mkldnn Output has diff with bias!")
def test(self):
self.run_and_statis(
quant=False,
passes=["conv_eltwiseadd_affine_channel_fuse_pass"], )
if __name__ == "__main__":
unittest.main()
......@@ -482,6 +482,11 @@ class OpTest(unittest.TestCase):
op_proto = OpProtoHolder.instance().get_op_proto(self.op_type)
"infer datatype from inputs and outputs for this test case"
if self.is_bfloat16_op():
self.dtype = np.uint16
self.__class__.dtype = self.dtype
self.output_dtype = np.uint16
else:
self.infer_dtype_from_inputs_outputs(self.inputs, self.outputs)
inputs = append_input_output(block, op_proto, self.inputs, True,
self.dtype)
......@@ -1135,7 +1140,7 @@ class OpTest(unittest.TestCase):
else:
atol = 2
else:
atol = 1e-2
atol = 1e-1
if no_check_set is not None:
if self.op_type not in no_check_set_white_list.no_check_set_white_list:
......
......@@ -55,7 +55,7 @@ class TestDiffOp(unittest.TestCase):
def test_dygraph(self):
for place in self.places:
paddle.disable_static(place)
paddle.disable_static()
x = paddle.to_tensor(self.input, place=place)
if self.prepend is not None:
self.prepend = paddle.to_tensor(self.prepend, place=place)
......
......@@ -16,7 +16,7 @@ from __future__ import print_function
import unittest
import numpy as np
from op_test import OpTest
from op_test import OpTest, convert_float_to_uint16
import paddle
import paddle.fluid as fluid
from paddle.framework import core
......@@ -117,6 +117,39 @@ class TestCase6(TestGatherOp):
self.index_type = "int32"
class TestGatherBF16Op(OpTest):
def setUp(self):
self.op_type = "gather"
self.dtype = np.uint16
self.config()
xnp = np.random.random(self.x_shape).astype(np.float32)
axis_np = np.array(self.axis).astype(self.axis_type)
index_np = np.array(self.index).astype(self.index_type)
self.inputs = {
'X': convert_float_to_uint16(xnp),
'Index': index_np,
'Axis': axis_np
}
out = gather_numpy(self.inputs['X'], index_np, axis_np[0])
self.outputs = {'Out': out}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
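        # bfloat16 inputs carry very limited precision, so a coarse delta is
        # used for the numeric gradient check.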
self.check_grad(['X'], 'Out', numeric_grad_delta=0.5)
def config(self):
"""
For multi-dimension input
"""
self.x_shape = (3, 88, 3)
self.index = [1, 3, 5]
self.index_type = "int32"
self.axis = [1]
self.axis_type = "int32"
class TestGatherOp1(OpTest):
def setUp(self):
self.op_type = "gather"
......
......@@ -26,159 +26,149 @@ import paddle.fluid.dygraph as dygraph
from paddle.fluid.dygraph.nn import Linear
import paddle.fluid.core as core
from paddle.fluid.optimizer import SGDOptimizer
class MLP(fluid.Layer):
def __init__(self, param_attr=None, bias_attr=None):
super(MLP, self).__init__()
self._linear1 = Linear(784, 10)
self._linear2 = Linear(10, 10)
def forward(self, inputs):
y = self._linear1(inputs)
y = self._linear2(y)
return y
from paddle.fluid.framework import _test_eager_guard
class TestDataParallelGroup(unittest.TestCase):
def create_varbase(self, dtype, shape,
type=core.VarDesc.VarType.LOD_TENSOR):
return core.VarBase(dtype, shape, "", type, True)
def create_varbase(self, dtype, shape):
return paddle.rand(shape=shape, dtype=dtype)
def assign_group_by_size(self, *args):
return core.assign_group_by_size(*args)
def test_construct_group0(self):
# one dtype & one limit capability
var_list = []
var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 50]))
var_list.append(
self.create_varbase(core.VarDesc.VarType.FP32, [2, 100]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 50]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 25]))
res = core.assign_group_by_size(var_list, [False, False, False, False],
var_list.append(self.create_varbase("float32", [2, 50]))
var_list.append(self.create_varbase("float32", [2, 100]))
var_list.append(self.create_varbase("float32", [2, 50]))
var_list.append(self.create_varbase("float32", [2, 25]))
res = self.assign_group_by_size(var_list, [False, False, False, False],
[400])
self.assertEqual([[0], [1], [2], [3]], res)
def test_construct_group1(self):
# multi dtype & one limit capability
var_list = []
var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25]))
res = core.assign_group_by_size(
var_list.append(self.create_varbase("float32", [1, 50]))
var_list.append(self.create_varbase("float64", [1, 25]))
var_list.append(self.create_varbase("float32", [1, 50]))
var_list.append(self.create_varbase("float64", [1, 25]))
var_list.append(self.create_varbase("float32", [1, 50]))
var_list.append(self.create_varbase("float64", [1, 25]))
res = self.assign_group_by_size(
var_list, [False, False, False, False, False, False], [400])
self.assertEqual([[0, 2], [1, 3], [4], [5]], res)
def test_construct_group2(self):
# one dtype & multi limit capability
var_list = []
var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 50]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 50]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 50]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 50]))
res = core.assign_group_by_size(var_list, [False, False, False, False],
var_list.append(self.create_varbase("float32", [2, 50]))
var_list.append(self.create_varbase("float32", [2, 50]))
var_list.append(self.create_varbase("float32", [2, 50]))
var_list.append(self.create_varbase("float32", [2, 50]))
res = self.assign_group_by_size(var_list, [False, False, False, False],
[400, 800])
self.assertEqual([[0], [1, 2], [3]], res)
def test_construct_group3(self):
# multi dtype & multi limit capability
var_list = []
var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25]))
res = core.assign_group_by_size(
var_list.append(self.create_varbase("float32", [1, 50]))
var_list.append(self.create_varbase("float64", [1, 25]))
var_list.append(self.create_varbase("float32", [1, 50]))
var_list.append(self.create_varbase("float64", [1, 25]))
var_list.append(self.create_varbase("float32", [1, 50]))
var_list.append(self.create_varbase("float64", [1, 25]))
res = self.assign_group_by_size(
var_list, [False, False, False, False, False, False], [200, 400])
self.assertEqual([[0], [1], [2, 4], [3, 5]], res)
def test_construct_group4(self):
# multi dtype & zero limit capability
var_list = []
var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25]))
res = core.assign_group_by_size(
var_list.append(self.create_varbase("float32", [1, 50]))
var_list.append(self.create_varbase("float64", [1, 25]))
var_list.append(self.create_varbase("float32", [1, 50]))
var_list.append(self.create_varbase("float64", [1, 25]))
var_list.append(self.create_varbase("float32", [1, 50]))
var_list.append(self.create_varbase("float64", [1, 25]))
res = self.assign_group_by_size(
var_list, [False, False, False, False, False, False], [0])
self.assertEqual([[0], [1], [2], [3], [4], [5]], res)
def test_construct_group5(self):
# multi dtype & infinite capability
var_list = []
var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25]))
res = core.assign_group_by_size(
var_list.append(self.create_varbase("float32", [1, 50]))
var_list.append(self.create_varbase("float64", [1, 25]))
var_list.append(self.create_varbase("float32", [1, 50]))
var_list.append(self.create_varbase("float64", [1, 25]))
var_list.append(self.create_varbase("float32", [1, 50]))
var_list.append(self.create_varbase("float64", [1, 25]))
res = self.assign_group_by_size(
var_list, [False, False, False, False, False, False], [10000])
self.assertEqual([[0, 2, 4], [1, 3, 5]], res)
def test_construct_group6(self):
# multi dtype & limit capability & multi tensor type
var_list = []
var_list.append(
self.create_varbase(core.VarDesc.VarType.FP32, [1, 50],
core.VarDesc.VarType.SELECTED_ROWS))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50]))
var_list.append(
self.create_varbase(core.VarDesc.VarType.FP64, [1, 25],
core.VarDesc.VarType.SELECTED_ROWS))
res = core.assign_group_by_size(
var_list.append(self.create_varbase(
"float32",
[1, 50], ))
var_list.append(self.create_varbase("float64", [1, 25]))
var_list.append(self.create_varbase("float32", [1, 50]))
var_list.append(self.create_varbase("float64", [1, 25]))
var_list.append(self.create_varbase("float32", [1, 50]))
var_list.append(self.create_varbase("float64", [1, 25]))
res = self.assign_group_by_size(
var_list, [True, False, False, False, False, True], [400])
self.assertEqual([[0], [1, 3], [2, 4], [5]], res)
def test_construct_group7(self):
# multi dtype & multi limit capability & multi tensor type
var_list = []
var_list.append(
self.create_varbase(core.VarDesc.VarType.FP32, [1, 50],
core.VarDesc.VarType.SELECTED_ROWS))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50]))
var_list.append(
self.create_varbase(core.VarDesc.VarType.FP64, [1, 25],
core.VarDesc.VarType.SELECTED_ROWS))
res = core.assign_group_by_size(
var_list.append(self.create_varbase("float32", [1, 50]))
var_list.append(self.create_varbase("float64", [1, 25]))
var_list.append(self.create_varbase("float32", [1, 50]))
var_list.append(self.create_varbase("float64", [1, 25]))
var_list.append(self.create_varbase("float32", [1, 50]))
var_list.append(self.create_varbase("float64", [1, 25]))
res = self.assign_group_by_size(
var_list, [True, False, False, False, False, True], [200, 400])
self.assertEqual([[0], [1], [2], [3], [4], [5]], res)
def test_construct_group8(self):
# one dtype & one limit capability & have tensor_indices
var_list = []
var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 25]))
var_list.append(
self.create_varbase(core.VarDesc.VarType.FP32, [2, 100]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 50]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 25]))
res = core.assign_group_by_size(var_list, [False, False, False, False],
var_list.append(self.create_varbase("float32", [2, 25]))
var_list.append(self.create_varbase("float32", [2, 100]))
var_list.append(self.create_varbase("float32", [2, 50]))
var_list.append(self.create_varbase("float32", [2, 25]))
res = self.assign_group_by_size(var_list, [False, False, False, False],
[400], [3, 0, 1, 2])
self.assertEqual([[3, 0], [1], [2]], res)
def test_construct_group9(self):
# one dtype & one limit capability & have tensor_indices
var_list = []
var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 25]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 25]))
var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 25]))
var_list.append(
self.create_varbase(core.VarDesc.VarType.FP32, [2, 1000]))
res = core.assign_group_by_size(var_list, [False, False, False, True],
var_list.append(self.create_varbase("float32", [2, 25]))
var_list.append(self.create_varbase("float32", [2, 25]))
var_list.append(self.create_varbase("float32", [2, 25]))
var_list.append(self.create_varbase("float32", [2, 1000]))
res = self.assign_group_by_size(var_list, [False, False, False, True],
[300], [1, 0, 2, 3])
self.assertEqual([[1, 0], [3], [2]], res)
class TestDataParallelGroupEager(TestDataParallelGroup):
def create_varbase(self, dtype, shape):
with _test_eager_guard():
return paddle.rand(shape=shape, dtype=dtype)
def assign_group_by_size(self, *args):
return core.eager_assign_group_by_size(*args)
if __name__ == '__main__':
unittest.main()
......@@ -375,6 +375,53 @@ class TestFP16ScaleBiasLayerNorm(unittest.TestCase):
assert_equal(b_g_np_1, b_g_np_2)
class TestBF16ScaleBiasLayerNorm(unittest.TestCase):
def check_main(self, x_np, weight_np, bias_np, dtype):
paddle.disable_static()
x = paddle.to_tensor(x_np)
weight = paddle.to_tensor(weight_np)
bias = paddle.to_tensor(bias_np)
if dtype == "bfloat16":
x = x.cast(paddle.fluid.core.VarDesc.VarType.BF16)
x.stop_gradient = False
weight.stop_gradient = False
bias.stop_gradient = False
y = F.layer_norm(x, x.shape[1:], weight, bias)
x_g, w_g, b_g = paddle.grad(y, [x, weight, bias])
y_np = y.cast('float32').numpy()
x_g_np = x_g.cast('float32').numpy()
w_g_np = w_g.cast('float32').numpy()
b_g_np = b_g.cast('float32').numpy()
paddle.enable_static()
return y_np, x_g_np, w_g_np, b_g_np
def test_main(self):
if (not core.is_compiled_with_cuda()) or (core.cudnn_version() < 8100):
return
x_np = np.random.random([10, 20]).astype('float32')
weight_np = np.random.random([20]).astype('float32')
bias_np = np.random.random([20]).astype('float32')
y_np_1, x_g_np_1, w_g_np_1, b_g_np_1 = self.check_main(
x_np, weight_np, bias_np, 'float32')
y_np_2, x_g_np_2, w_g_np_2, b_g_np_2 = self.check_main(
x_np, weight_np, bias_np, 'bfloat16')
def assert_equal(x, y):
self.assertTrue(np.allclose(x, y, atol=1.e-1))
assert_equal(y_np_1, y_np_2)
assert_equal(x_g_np_1, x_g_np_2)
assert_equal(w_g_np_1, w_g_np_2)
assert_equal(b_g_np_1, b_g_np_2)
class TestGetSetKeepLayerNormScaleBiasFP32Flag(unittest.TestCase):
def test_main(self):
self.assertTrue(_keep_layer_norm_scale_bias_to_fp32())
......
......@@ -16,7 +16,7 @@ from __future__ import print_function
import unittest
import numpy as np
from op_test import OpTest
from op_test import OpTest, convert_float_to_uint16
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
......@@ -282,6 +282,80 @@ class TestPnormOpFP161(TestPnormOpFP16):
self.asvector = True
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestPnormBF16Op(OpTest):
def setUp(self):
self.op_type = "p_norm"
self.init_test_case()
self.x = (np.random.random(self.shape) + 0.5).astype(np.float32)
self.norm = p_norm(self.x, self.axis, self.porder, self.keepdim,
self.asvector)
self.gradient = self.calc_gradient()
self.inputs = {'X': convert_float_to_uint16(self.x)}
self.attrs = {
'epsilon': self.epsilon,
'axis': self.axis,
'keepdim': self.keepdim,
'porder': float(self.porder),
'asvector': self.asvector
}
self.outputs = {'Out': convert_float_to_uint16(self.norm)}
def test_check_output(self):
place = core.CUDAPlace(0)
self.check_output_with_place(place, atol=1e-3)
def test_check_grad(self):
place = core.CUDAPlace(0)
self.check_grad_with_place(
place, ['X'], 'Out', user_defined_grads=self.gradient)
def init_test_case(self):
self.shape = [2, 3, 4, 5]
self.axis = 1
self.epsilon = 1e-12
self.porder = 2.0
self.keepdim = False
self.dtype = np.uint16
self.asvector = False
def calc_gradient(self):
self.attrs = {
'epsilon': self.epsilon,
'axis': self.axis,
'keepdim': self.keepdim,
'porder': float(self.porder),
'asvector': self.asvector
}
x = self.x
porder = self.attrs["porder"]
axis = self.attrs["axis"]
asvector = self.attrs["asvector"]
x_dtype = x.dtype
x = x.astype(np.float32) if x.dtype == np.float16 else x
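        # For p not in {0, +inf, -inf}:
        #   d||x||_p / dx_i = sign(x_i) * |x_i|^(p - 1) / ||x||_p^(p - 1)
        # The p = 0 and p = +/-inf cases are handled separately below.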
if porder == 0:
grad = np.zeros(x.shape).astype(x.dtype)
elif porder in [float("inf"), float("-inf")]:
norm = p_norm(
x, axis=axis, porder=porder, keepdims=True, reduce_all=asvector)
x_abs = np.abs(x)
grad = np.sign(x)
grad[x_abs != norm] = 0.0
else:
norm = p_norm(
x, axis=axis, porder=porder, keepdims=True, reduce_all=asvector)
grad = np.power(norm, 1 - porder) * np.power(
np.abs(x), porder - 1) * np.sign(x)
numel = 1
for s in x.shape:
numel *= s
divisor = numel if asvector else x.shape[axis]
numel /= divisor
return [grad.astype(x_dtype) * 1 / numel]
def run_fro(self, p, axis, shape_x, dtype, keep_dim, check_dim=False):
with fluid.program_guard(fluid.Program()):
data = fluid.data(name="X", shape=shape_x, dtype=dtype)
......
......@@ -16,7 +16,7 @@ from __future__ import print_function
import unittest
import numpy as np
from op_test import OpTest, skip_check_grad_ci
from op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16
import paddle
import paddle.fluid.core as core
import paddle.fluid as fluid
......@@ -61,6 +61,37 @@ class TestSumOp_fp16(OpTest):
self.check_grad(['X'], 'Out', user_defined_grads=self.gradient)
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestSumOp_bf16(OpTest):
def setUp(self):
np.random.seed(100)
self.op_type = "reduce_sum"
self.dtype = np.uint16
self.x = np.random.uniform(0, 0.1, (2, 5, 10)).astype(np.float32)
self.attrs = {'dim': [0, 1, 2]}
self.out = self.x.sum(axis=tuple(self.attrs['dim']))
self.gradient = self.calc_gradient()
self.inputs = {'X': convert_float_to_uint16(self.x)}
self.outputs = {'Out': convert_float_to_uint16(self.out)}
self.gradient = self.calc_gradient()
def test_check_output(self):
place = core.CUDAPlace(0)
self.check_output_with_place(place)
def test_check_grad(self):
place = core.CUDAPlace(0)
self.check_grad_with_place(
place, ['X'], 'Out', user_defined_grads=self.gradient)
def calc_gradient(self):
x = self.x
grad = np.ones(x.shape, dtype=x.dtype)
return [grad]
class TestSumOp_fp16_withInt(OpTest):
def setUp(self):
self.op_type = "reduce_sum"
......
......@@ -16,7 +16,7 @@ from __future__ import print_function
import unittest
import numpy as np
from op_test import OpTest
from op_test import OpTest, convert_float_to_uint16
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
......@@ -153,6 +153,23 @@ class TestScaleFp16Op(TestScaleOp):
place, ["X"], "Out", max_relative_error=0.05)
class TestScaleBF16Op(OpTest):
def setUp(self):
self.op_type = "scale"
self.dtype = np.uint16
self.attrs = {'scale': -2.3}
x = np.random.random((10, 10)).astype(np.float32)
out = x * np.float32(self.attrs['scale'])
self.inputs = {'X': convert_float_to_uint16(x)}
self.outputs = {'Out': convert_float_to_uint16(out)}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(['X'], 'Out', numeric_grad_delta=0.8)
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestScaleFp16OpSelectedRows(TestScaleOpSelectedRows):
......
......@@ -298,6 +298,32 @@ def create_test_sum_fp16_class(parent):
globals()[cls_name] = TestSumFp16Case
#----------- test bf16 -----------
class TestSumBF16Op(OpTest):
def setUp(self):
self.op_type = "sum"
self.init_kernel_type()
x0 = np.random.random((3, 40)).astype(np.float32)
x1 = np.random.random((3, 40)).astype(np.float32)
x2 = np.random.random((3, 40)).astype(np.float32)
y = x0 + x1 + x2
self.inputs = {
"X": [("x0", convert_float_to_uint16(x0)),
("x1", convert_float_to_uint16(x1)),
("x2", convert_float_to_uint16(x2))]
}
self.outputs = {'Out': convert_float_to_uint16(y)}
def init_kernel_type(self):
self.dtype = np.uint16
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(['x0'], 'Out', numeric_grad_delta=0.5)
class API_Test_Add_n(unittest.TestCase):
def test_api(self):
with fluid.program_guard(fluid.Program(), fluid.Program()):
......
......@@ -171,10 +171,7 @@ class DistributedFusedLamb(Optimizer):
moment2.is_distributed = True
beta1pow = self._create_persistable_var('beta1pow')
beta2pow = self._create_persistable_var('beta2pow')
fused_indices = self._create_persistable_var(
'fused_indices', dtype='int32')
weight_decay = self._create_persistable_var('weight_decay')
weight_decay.is_distributed = True
param_info = self._create_persistable_var('param_info', dtype='int32')
param_info.is_distributed = True
......@@ -189,17 +186,20 @@ class DistributedFusedLamb(Optimizer):
'fp16_partial_fused_offsets', dtype='int32')
fp16_partial_fused_offsets.is_distributed = True
param_order = self._create_persistable_var('param_order', dtype='int32')
param_order.is_distributed = True
rank = get_rank()
nranks = get_world_size()
scale = self._get_or_create_scale()
params = [p for p, _ in params_grads]
grads = [g for _, g in params_grads]
weight_decay_values = [self._weight_decay] * len(params)
apply_weight_decay = [1] * len(params)
if self._exclude_from_weight_decay_fn is not None:
for i, p in enumerate(params):
if self._exclude_from_weight_decay_fn(p):
weight_decay_values[i] = 0.0
apply_weight_decay[i] = 0
startup_block = self.helper.startup_program.global_block()
for g in grads:
......@@ -225,8 +225,6 @@ class DistributedFusedLamb(Optimizer):
'Moment2': [moment2],
'Beta1Pow': [beta1pow],
'Beta2Pow': [beta2pow],
'FusedIndices': [fused_indices],
'WeightDecay': [weight_decay],
'GlobalScale': [scale],
'ParamInfo': [param_info],
'ParamOut': params,
......@@ -235,12 +233,13 @@ class DistributedFusedLamb(Optimizer):
'FP32ShardFusedParamOffsets': [fp32_partial_fused_offsets],
'FP16ShardFusedParamOffsets': [fp16_partial_fused_offsets],
'FusedParamOffsets': [fused_offsets],
'ParamOrder': [param_order],
},
attrs={
'alignment': self._alignment,
'rank': rank,
'nranks': nranks,
'weight_decay': weight_decay_values,
'apply_weight_decay': apply_weight_decay,
'moment1': 0.0,
'moment2': 0.0,
'beta1': self._beta1,
......@@ -272,8 +271,6 @@ class DistributedFusedLamb(Optimizer):
'Moment2': [moment2],
'Beta1Pow': [beta1pow],
'Beta2Pow': [beta2pow],
'FusedIndices': [fused_indices],
'WeightDecay': [weight_decay],
'GlobalScale': [scale],
'ParamInfo': [param_info],
'Param': params,
......@@ -281,6 +278,7 @@ class DistributedFusedLamb(Optimizer):
'FusedParamOffsets': [fused_offsets],
'FP32ShardFusedParamOffsets': [fp32_partial_fused_offsets],
'FP16ShardFusedParamOffsets': [fp16_partial_fused_offsets],
'ParamOrder': [param_order],
},
outputs={
'FP32FusedParamOut': [fp32_fused_param],
......@@ -294,6 +292,7 @@ class DistributedFusedLamb(Optimizer):
'FoundInf': [self._found_inf],
},
attrs={
'weight_decay': self._weight_decay,
'beta1': self._beta1,
'beta2': self._beta2,
'epsilon': self._epsilon,
......
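The DistributedFusedLamb change above drops the per-parameter FusedIndices/WeightDecay tensors and instead passes a single scalar weight_decay attribute together with a 0/1 apply_weight_decay mask built from the exclusion callback. A minimal sketch of that mask construction (the helper name is hypothetical, not Paddle API):

```python
def build_weight_decay_mask(param_names, weight_decay, exclude_fn=None):
    # One flag per parameter: 1 = apply decay, 0 = excluded.
    apply_weight_decay = [1] * len(param_names)
    if exclude_fn is not None:
        for i, name in enumerate(param_names):
            if exclude_fn(name):
                apply_weight_decay[i] = 0
    return weight_decay, apply_weight_decay

# e.g. skip decay for bias and LayerNorm parameters
wd, mask = build_weight_decay_mask(
    ["fc_0.w_0", "fc_0.b_0", "layer_norm_0.w_0"],
    weight_decay=0.01,
    exclude_fn=lambda n: "b_0" in n or "layer_norm" in n)
# wd == 0.01, mask == [1, 0, 0]
```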
......@@ -1667,11 +1667,11 @@ def cross_entropy(input,
label_min = paddle.min(valid_label)
label_max = paddle.max(valid_label)
if label_min < 0:
raise ValueError("label should not out of bound, but got{}".
format(label_min))
raise ValueError("Target {} is out of lower bound.".format(
label_min.item()))
if label_max >= input.shape[axis]:
raise ValueError("label should not out of bound, but got{}".
format(label_max))
raise ValueError("Target {} is out of upper bound.".format(
label_max.item()))
if core.is_compiled_with_npu() or core.is_compiled_with_mlu():
_, _, out = _C_ops.softmax_with_cross_entropy(
input, label, 'soft_label', soft_label, 'ignore_index',
......
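The cross_entropy change above rewords the label bound-check errors and reports the offending scalar via .item(). A standalone sketch of the check, assuming labels must lie in [0, C) with C = input.shape[axis]:

```python
import numpy as np

def check_label_bounds(label, num_classes):
    label_min, label_max = int(label.min()), int(label.max())
    if label_min < 0:
        raise ValueError("Target {} is out of lower bound.".format(label_min))
    if label_max >= num_classes:
        raise ValueError("Target {} is out of upper bound.".format(label_max))

check_label_bounds(np.array([0, 2, 4]), num_classes=5)   # passes
# check_label_bounds(np.array([0, 5]), num_classes=5)    # raises "Target 5 is out of upper bound."
```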
......@@ -451,7 +451,20 @@ PADDLE_API {self.outputs['return_type']} {self.get_api_func_name() + '_'}({self.
param_code = ""
for param in infer_meta_params:
if param in input_names:
if param in self.optional_vars:
if self.inputs['input_info'][param] == "const Tensor&":
param_code = param_code + "MakeMetaTensor(*" + PREFIX_TENSOR_NAME + param + "), "
elif self.inputs['input_info'][
param] == "const std::vector<Tensor>&":
meta_tensor_code = meta_tensor_code + f"""
{code_indent} auto {param}_meta_vec = MakeMetaTensor(*{PREFIX_TENSOR_NAME}{param});
{code_indent} std::vector<phi::MetaTensor*> {param}_metas({param}_meta_vec.size());
{code_indent} for (size_t i = 0; i < {param}_meta_vec.size(); ++i) {{
{code_indent} {param}_metas[i] = &{param}_meta_vec[i];
{code_indent} }}
"""
param_code = param_code + param + "_metas, "
elif param in self.optional_vars:
meta_tensor_code = meta_tensor_code + f"""
{code_indent} paddle::optional<const phi::MetaTensor&> {PREFIX_TENSOR_NAME}meta_ref_{param}(paddle::none);
{code_indent} auto {PREFIX_TENSOR_NAME}meta_{param} = MakeMetaTensor({PREFIX_TENSOR_NAME}{param});
......@@ -461,7 +474,9 @@ PADDLE_API {self.outputs['return_type']} {self.get_api_func_name() + '_'}({self.
param_code = param_code + f"{PREFIX_TENSOR_NAME}meta_ref_{param}, "
else:
param_code = param_code + "MakeMetaTensor(*" + PREFIX_TENSOR_NAME + param + "), "
raise ValueError(
f"{self.api} : Param of infer_meta error : {self.inputs['input_info'][param]} type is not supported."
)
elif param in kernel_output_names:
meta_tensor_code = meta_tensor_code + code_indent + " phi::MetaTensor " + param.replace(
'kernel_', PREFIX_META_TENSOR_NAME) + "(" + param + ");\n"
......
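The codegen change above adds a branch for const std::vector<Tensor>& inputs: instead of a single MakeMetaTensor(...) argument, the generator now emits C++ that first builds a std::vector<phi::MetaTensor*> and passes that to the infer_meta call, and it raises a clear error for unsupported input types. A trimmed-down Python sketch of that branch (function and variable names are illustrative, not the real generator's):

```python
PREFIX_TENSOR_NAME = "input_"  # assumed prefix, mirroring the generator above

def emit_infer_meta_arg(param, input_info, code_indent="  "):
    """Return (extra C++ declarations, call-argument string) for one infer_meta input."""
    if input_info[param] == "const Tensor&":
        return "", f"MakeMetaTensor(*{PREFIX_TENSOR_NAME}{param}), "
    if input_info[param] == "const std::vector<Tensor>&":
        decl = f"""
{code_indent}auto {param}_meta_vec = MakeMetaTensor(*{PREFIX_TENSOR_NAME}{param});
{code_indent}std::vector<phi::MetaTensor*> {param}_metas({param}_meta_vec.size());
{code_indent}for (size_t i = 0; i < {param}_meta_vec.size(); ++i) {{
{code_indent}  {param}_metas[i] = &{param}_meta_vec[i];
{code_indent}}}
"""
        return decl, f"{param}_metas, "
    raise ValueError(
        f"Param of infer_meta error: {input_info[param]} type is not supported.")

decl, arg = emit_infer_meta_arg("x", {"x": "const std::vector<Tensor>&"})
```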
......@@ -106,7 +106,7 @@ function prepare_benchmark_environment {
[ $? -ne 0 ] && LOG "[FATAL] Clone benchmark repo fail." && exit -1
LOG "[INFO] Collect api info ..."
python benchmark/api/deploy/collect_api_info.py \
--test_module_name tests_v2 \
--test_module_name dynamic_tests_v2 \
--info_file api_info.txt >& 2
[ $? -ne 0 ] && LOG "[FATAL] Collect api info fail." && exit -1
[ ! -f benchmark/ci/scripts/op_benchmark.config ] && LOG "[FATAL] Missing op_benchmark.config!" && exit -1
......@@ -185,7 +185,7 @@ function run_op_benchmark_test {
logs_dir="$(pwd)/logs-${branch_name}"
[ -d $logs_dir ] && rm -rf $logs_dir/* || mkdir -p $logs_dir
pushd benchmark/api > /dev/null
bash deploy/main_control.sh tests_v2 \
bash deploy/main_control.sh dynamic_tests_v2 \
tests_v2/configs \
$logs_dir \
$VISIBLE_DEVICES \
......@@ -212,7 +212,7 @@ function check_op_benchmark_result {
# there is no need to recompile and install paddle
LOG "[INFO] retry ${retry_time} times ..."
pushd benchmark/api > /dev/null
bash deploy/main_control.sh tests_v2 \
bash deploy/main_control.sh dynamic_tests_v2 \
tests_v2/configs \
${logs_dir} \
$VISIBLE_DEVICES \
......
......@@ -958,7 +958,6 @@ FOURTH_HIGH_PARALLEL_JOB_NEW = [
'test_dynamic_rnn_stop_gradient', 'test_raw_program_optimizer', 'test_pow',
'test_inplace_softmax_with_cross_entropy', 'test_transforms',
'test_unfold_op', 'test_assign_op', 'test_isinstance',
'test_conv_affine_channel_fuse_pass',
'auto_growth_best_fit_allocator_facade_test', 'test_cholesky_op',
'test_adaptive_avg_pool3d', 'test_paddle_save_load_binary',
'test_fused_fc_elementwise_layernorm_op', 'test_sequence_enumerate_op',
......@@ -1873,7 +1872,6 @@ TETRAD_PARALLEL_JOB = [
'test_dataloader_unkeep_order',
'test_parallel_executor_profiler',
'test_correlation',
'test_conv_affine_channel_fuse_pass',
'test_ir_inplace_pass',
'test_moving_average_abs_max_scale_op',
'test_flatten_contiguous_range_op',
......
......@@ -578,7 +578,6 @@ STATIC_MODE_TESTING_LIST = [
'test_ir_embedding_eltwise_layernorm_fuse_pass',
'test_ir_fc_fuse_pass',
'test_ir_skip_layernorm_pass',
'test_conv_affine_channel_fuse_pass',
'test_conv_bias_mkldnn_fuse_pass',
'test_conv_bn_fuse_pass',
'test_conv_elementwise_add2_act_fuse_pass',
......