提交 a4bccde0 编写于 作者: P phlrain

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into move_sgd_to_phi

......@@ -36,7 +36,7 @@ ENDIF()
if(NOT DEFINED XPU_BASE_URL)
SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev")
SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220219")
SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220228")
else()
SET(XPU_BASE_URL "${XPU_BASE_URL}")
endif()
......
......@@ -667,6 +667,7 @@ function(xpu_library TARGET_NAME)
else()
xpu_add_library(${TARGET_NAME} STATIC ${xpu_library_SRCS} DEPENDS ${xpu_library_DEPS})
find_fluid_modules(${TARGET_NAME})
find_phi_modules(${TARGET_NAME})
endif()
if (xpu_library_DEPS)
add_dependencies(${TARGET_NAME} ${xpu_library_DEPS})
......
......@@ -83,6 +83,8 @@ function(kernel_declare TARGET_LIST)
file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, XPU, ALL_LAYOUT);\n")
elseif (${kernel_path} MATCHES "./gpudnn\/")
file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, GPUDNN, ALL_LAYOUT);\n")
elseif (${kernel_path} MATCHES "./kps\/")
file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, KPS, ALL_LAYOUT);\n")
else ()
# deal with device independent kernel, now we use CPU temporaary
file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, CPU, ALL_LAYOUT);\n")
......@@ -97,6 +99,7 @@ function(kernel_library TARGET)
set(gpu_srcs)
set(xpu_srcs)
set(gpudnn_srcs)
set(kps_srcs)
set(selected_rows_srcs)
# parse and save the deps kerenl targets
set(all_srcs)
......@@ -128,6 +131,9 @@ function(kernel_library TARGET)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu.cc)
list(APPEND gpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu.cc)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu)
list(APPEND gpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}_gpudnn.cu)
list(APPEND gpudnn_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}_gpudnn.cu)
endif()
......@@ -137,6 +143,15 @@ function(kernel_library TARGET)
list(APPEND xpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/xpu/${TARGET}.cc)
endif()
endif()
if (WITH_XPU_KP)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu)
# Change XPU2 file suffix
# NOTE(chenweihang): If we can be sure that the *.kps suffix is no longer used, it can be copied directly to *.xpu
file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/kps)
file(RENAME ${CMAKE_CURRENT_BINARY_DIR}/kps/${TARGET}.cu ${CMAKE_CURRENT_BINARY_DIR}/kps/${TARGET}.kps)
list(APPEND kps_srcs ${CMAKE_CURRENT_BINARY_DIR}/kps/${TARGET}.kps)
endif()
endif()
else()
# TODO(chenweihang): impl compile by source later
endif()
......@@ -150,6 +165,7 @@ function(kernel_library TARGET)
list(APPEND all_srcs ${gpu_srcs})
list(APPEND all_srcs ${xpu_srcs})
list(APPEND all_srcs ${gpudnn_srcs})
list(APPEND all_srcs ${kps_srcs})
foreach(src ${all_srcs})
file(READ ${src} target_content)
string(REGEX MATCHALL "#include \"paddle\/phi\/kernels\/[a-z0-9_]+_kernel.h\"" include_kernels ${target_content})
......@@ -176,11 +192,20 @@ function(kernel_library TARGET)
list(LENGTH gpu_srcs gpu_srcs_len)
list(LENGTH xpu_srcs xpu_srcs_len)
list(LENGTH gpudnn_srcs gpudnn_srcs_len)
list(LENGTH kps_srcs kps_srcs_len)
list(LENGTH selected_rows_srcs selected_rows_srcs_len)
# kernel source file level
# level 1: base device kernel
# - cpu_srcs / gpu_srcs / xpu_srcs / kps_srcs
# level 2: device-independent kernel
# - common_srcs
# level 3: Kernel implemented by reusing device-independent kernel
# - selected_rows_srcs
# Build Target according different src organization
if((${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR
${xpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) AND
${xpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0) AND
(${common_srcs_len} GREATER 0 OR ${selected_rows_srcs_len} GREATER 0))
# If the common_srcs/selected_rows_srcs depends on specific device srcs, build target using this rule.
if (WITH_GPU)
......@@ -193,6 +218,11 @@ function(kernel_library TARGET)
hip_library(${TARGET}_part SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
hip_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part)
endif()
elseif (WITH_XPU_KP)
if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0)
xpu_library(${TARGET}_part SRCS ${cpu_srcs} ${xpu_srcs} ${kps_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
xpu_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part)
endif()
else()
if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0)
cc_library(${TARGET}_part SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
......@@ -200,7 +230,7 @@ function(kernel_library TARGET)
endif()
endif()
# If there are only specific device srcs, build target using this rule.
elseif (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0)
elseif (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0)
if (WITH_GPU)
if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0)
nv_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
......@@ -209,6 +239,10 @@ function(kernel_library TARGET)
if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0)
hip_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
endif()
elseif (WITH_XPU_KP)
if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0)
xpu_library(${TARGET} SRCS ${cpu_srcs} ${xpu_srcs} ${kps_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
endif()
else()
if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0)
cc_library(${TARGET} SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
......@@ -222,6 +256,9 @@ function(kernel_library TARGET)
elseif (WITH_ROCM)
hip_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
hip_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part)
elseif (WITH_XPU_KP)
xpu_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
xpu_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part)
else()
cc_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
cc_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part)
......@@ -232,6 +269,8 @@ function(kernel_library TARGET)
nv_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
elseif (WITH_ROCM)
hip_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
elseif (WITH_XPU_KP)
xpu_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
else()
cc_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
endif()
......@@ -240,6 +279,8 @@ function(kernel_library TARGET)
nv_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
elseif (WITH_ROCM)
hip_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
elseif (WITH_XPU_KP)
xpu_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
else()
cc_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
endif()
......@@ -249,7 +290,7 @@ function(kernel_library TARGET)
if (${target_build_flag} EQUAL 1)
if (${common_srcs_len} GREATER 0 OR ${cpu_srcs_len} GREATER 0 OR
${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR
${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0 OR
${gpudnn_srcs_len} GREATER 0 OR ${selected_rows_srcs_len} GREATER 0)
# append target into PHI_KERNELS property
get_property(phi_kernels GLOBAL PROPERTY PHI_KERNELS)
......@@ -275,6 +316,9 @@ function(kernel_library TARGET)
if (${gpudnn_srcs_len} GREATER 0)
kernel_declare(${gpudnn_srcs})
endif()
if (${kps_srcs_len} GREATER 0)
kernel_declare(${kps_srcs})
endif()
if (${selected_rows_srcs_len} GREATER 0)
kernel_declare(${selected_rows_srcs})
endif()
......
cc_library(processgroup SRCS ProcessGroup.cc DEPS phi phi_api eager_api)
cc_library(eager_reducer SRCS reducer.cc DEPS eager_api processgroup)
if(WITH_NCCL)
cc_library(processgroup_nccl SRCS ProcessGroupNCCL.cc DEPS place cuda_stream enforce collective_helper device_context phi phi_api eager_api)
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/collective/reducer.h"
#include "paddle/phi/common/data_type.h"
namespace paddle {
namespace distributed {
std::vector<std::vector<size_t>> Eager_AssignGroupBySize(
const std::vector<Tensor> tensors,
const std::vector<bool> &is_sparse_gradient,
const std::vector<size_t> &group_size_limits,
const std::vector<int64_t> &tensor_indices) {
PADDLE_ENFORCE_EQ(
tensors.size(), is_sparse_gradient.size(),
platform::errors::PreconditionNotMet(
"tensors len must be equal to is_sparse_gradient len, but "
"[%lu] != [%lu]",
tensors.size(), is_sparse_gradient.size()));
auto check_perm = [](const std::vector<int64_t> &x) -> bool {
size_t len = x.size();
std::vector<size_t> cnt(len, 0);
for (size_t i = 0; i < len; ++i) {
if (x[i] >= static_cast<int64_t>(len) || x[i] < 0 || cnt[x[i]]) {
return false;
}
cnt[x[i]]++;
}
return true;
};
PADDLE_ENFORCE_EQ(true, check_perm(tensor_indices),
platform::errors::PreconditionNotMet(
"tensor_indices must be a permutation from 0 to %lu",
tensor_indices.size()));
// the return vector
std::vector<std::vector<size_t>> res;
// Key: the var type
// Value: should use which index in group_size_limits for group size limit
std::map<experimental::DataType, size_t> group_limit_index;
// Key: the var type
// Value: <the var index in input tensors, total numel in this group>
std::map<experimental::DataType, std::pair<std::vector<size_t>, size_t>>
next_group;
for (size_t i = 0; i < tensors.size(); ++i) {
const auto &var = tensors[i];
size_t tensor_real_index = i;
if (!tensor_indices.empty()) {
tensor_real_index = tensor_indices[i];
}
if (is_sparse_gradient[tensor_real_index]) {
// we keep sparse var a single group
res.push_back({tensor_real_index});
continue;
}
const auto &var_dtype = var.dtype();
VLOG(3) << "var[" << var.name() << "] 's type is " << var_dtype;
auto &group_info = next_group[var_dtype];
int64_t var_size = -1;
if (var.is_dense_tensor()) {
var_size =
std::dynamic_pointer_cast<phi::DenseTensor>(var.impl())->numel();
} else {
VLOG(3) << "var " << var.name()
<< " is not tensor or selected_rows, so skip it";
continue;
}
group_info.first.push_back(tensor_real_index);
group_info.second += experimental::SizeOf(var_dtype) * var_size;
// group_info.second += framework::SizeOfType(var_dtype) * var_size;
if (group_limit_index.find(var_dtype) == group_limit_index.end()) {
// means it is the first var of var_dtype
group_limit_index[var_dtype] = 0;
}
auto &cur_limit_index = group_limit_index[var_dtype];
if (group_info.second >= group_size_limits[cur_limit_index]) {
// exceed group capacity and create a new group
res.emplace_back(std::move(group_info.first));
group_info = std::pair<std::vector<size_t>, size_t>();
cur_limit_index =
(std::min)(cur_limit_index + 1, group_size_limits.size() - 1);
}
}
// add the final groups
for (auto &e : next_group) {
auto &group_info = e.second;
if (!group_info.first.empty()) {
res.emplace_back(std::move(group_info.first));
}
}
for (const auto &group_index : res) {
PADDLE_ENFORCE_NE(
group_index.empty(), true,
platform::errors::PreconditionNotMet(
"AssignGroupBySize construct empty group, please check."));
}
if (tensor_indices.empty()) {
std::sort(res.begin(), res.end(),
[](const std::vector<size_t> &x, const std::vector<size_t> &y) {
return x.front() < y.front();
});
}
return res;
}
} // namespace distributed
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......@@ -14,41 +14,19 @@
#pragma once
#include <string>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include <map>
#include <vector>
#include "paddle/fluid/distributed/collective/ProcessGroup.h"
#include "paddle/fluid/eager/api/utils/tensor_utils.h"
namespace paddle {
namespace framework {
namespace ir {
/*
* Fuse the Conv and ConvAffineChannel.
*/
class Graph;
class ConvAffineChannelFusePass : public FusePassBase {
public:
ConvAffineChannelFusePass();
virtual ~ConvAffineChannelFusePass() {}
protected:
void ApplyImpl(ir::Graph*) const override;
const std::string name_scope_{"conv_affine_channel_fuse"};
};
class ConvEltwiseAddAffineChannelFusePass : public FusePassBase {
public:
ConvEltwiseAddAffineChannelFusePass();
virtual ~ConvEltwiseAddAffineChannelFusePass() {}
namespace distributed {
using Tensor = paddle::experimental::Tensor;
protected:
void ApplyImpl(ir::Graph*) const override;
const std::string name_scope_{"conv_eltwiseadd_affine_channel_fuse"};
};
std::vector<std::vector<size_t>> Eager_AssignGroupBySize(
const std::vector<Tensor>, const std::vector<bool>& is_sparse_gradient,
const std::vector<size_t>& group_size_limits,
const std::vector<int64_t>& tensor_indices = {});
} // namespace ir
} // namespace framework
} // namespace distributed
} // namespace paddle
......@@ -308,22 +308,25 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx,
// TODO(chenweihang): support multiple inputs and outputs later
phi::InferMetaContext infer_mete_context;
for (auto& in_name : input_names) {
if (ctx->HasInput(in_name)) {
infer_meta_context.EmplaceBackInput(std::make_shared<CompatMetaTensor>(
ctx->GetInputVarPtrs(in_name)[0], ctx->IsRuntime()));
if (ctx->HasInputs(in_name)) {
auto input_var = ctx->GetInputVarPtrs(in_name);
if (input_var.size() == 1) {
infer_meta_context.EmplaceBackInput(
std::make_shared<CompatMetaTensor>(input_var[0], ctx->IsRuntime()));
} else {
infer_meta_context.EmplaceBackInput({nullptr});
paddle::SmallVector<std::shared_ptr<phi::MetaTensor>> inputs;
inputs.reserve(input_var.size());
for (const auto& in : input_var) {
inputs.push_back(
std::make_shared<CompatMetaTensor>(in, ctx->IsRuntime()));
}
infer_meta_context.EmplaceBackInputs(std::move(inputs));
}
for (auto& out_name : output_names) {
if (ctx->HasOutput(out_name)) {
infer_meta_context.EmplaceBackOutput(std::make_shared<CompatMetaTensor>(
ctx->GetOutputVarPtrs(out_name)[0], ctx->IsRuntime()));
} else {
infer_meta_context.EmplaceBackOutput({nullptr});
infer_meta_context.EmplaceBackInput({nullptr});
}
}
auto attr_reader = ctx->Attrs();
for (size_t i = 0; i < attr_names.size(); ++i) {
auto attr_name = attr_names[i];
......@@ -348,13 +351,13 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx,
}
} else {
// If is not in runtime, we will set default value(-1) for ScalarArray
int64_t num_ele = 0;
std::vector<VarDesc*> vars;
vars.reserve(infershape_inputs.size());
for (size_t i = 0; i < infershape_inputs.size(); i++) {
for (size_t i = 0; i < infershape_inputs.size(); ++i) {
vars.push_back(BOOST_GET_CONST(VarDesc*, infershape_inputs[i]));
}
int64_t num_ele = 0;
if (vars.size() == 1) {
num_ele = 1;
const auto& tensor_dims = vars[0]->GetShape();
......@@ -362,16 +365,7 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx,
num_ele *= tensor_dims[i];
}
} else {
for (auto& var : vars) {
const auto& tensor_dims = var->GetShape();
PADDLE_ENFORCE_EQ(tensor_dims.size(), 1,
platform::errors::InvalidArgument(
"The shape is constructed by multi-tensor, "
"every tensor's dims should be 1. But your "
"shape has tensor that dims is %s.",
tensor_dims.size()));
num_ele += tensor_dims[0];
}
num_ele = vars.size();
}
phi::ScalarArray tensor_attr(std::vector<int32_t>(num_ele, -1));
tensor_attr.SetFromTensor(true);
......@@ -383,10 +377,14 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx,
std::type_index(typeid(std::vector<int32_t>))) {
infer_meta_context.EmplaceBackAttr(std::move(
phi::ScalarArray(BOOST_GET_CONST(std::vector<int32_t>, attr))));
} else if (std::type_index(attr.type()) ==
std::type_index(typeid(int))) {
infer_meta_context.EmplaceBackAttr(
phi::ScalarArray({BOOST_GET_CONST(int, attr)}));
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Unsupported cast op attribute `%s` to ScalarArray when "
"construct KernelContext.",
"construct InferMetaContext.",
attr_name));
}
}
......@@ -414,7 +412,6 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx,
}
} else if (ctx->HasInput(attr_name)) {
const auto& infershape_input = ctx->GetInputVarPtrs(attr_name);
if (infershape_input.size() == 1) {
if (ctx->IsRuntime()) {
Variable* var = BOOST_GET_CONST(Variable*, infershape_input[0]);
......@@ -490,6 +487,28 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx,
"Unsupported attribute type is received when call "
"InferShapeFunctor."));
}
} else {
// do nothing
}
}
for (auto& out_name : output_names) {
if (ctx->HasOutputs(out_name)) {
auto output_var = ctx->GetOutputVarPtrs(out_name);
if (output_var.size() == 1) {
infer_meta_context.EmplaceBackOutput(std::make_shared<CompatMetaTensor>(
output_var[0], ctx->IsRuntime()));
} else {
paddle::SmallVector<std::shared_ptr<phi::MetaTensor>> outputs;
outputs.reserve(output_var.size());
for (const auto& out : output_var) {
outputs.emplace_back(
std::make_shared<CompatMetaTensor>(out, ctx->IsRuntime()));
}
infer_meta_context.EmplaceBackOutputs(std::move(outputs));
}
} else {
infer_meta_context.EmplaceBackOutput({nullptr});
}
}
......
......@@ -78,7 +78,6 @@ pass_library(is_test_pass base)
pass_library(conv_elementwise_add_act_fuse_pass inference)
pass_library(conv_elementwise_add2_act_fuse_pass inference)
pass_library(conv_elementwise_add_fuse_pass inference)
pass_library(conv_affine_channel_fuse_pass inference)
pass_library(transpose_flatten_concat_fuse_pass inference)
pass_library(identity_scale_op_clean_pass base)
pass_library(sync_batch_norm_pass base)
......
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h"
#include <cmath>
#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/framework/op_version_registry.h"
namespace phi {
class DenseTensor;
} // namespace phi
namespace paddle {
namespace framework {
class Scope;
} // namespace framework
} // namespace paddle
namespace paddle {
namespace framework {
namespace ir {
class Node;
#define GET_CONV_BN_NODES(pattern_name) \
/* OPERATORS */ \
GET_IR_NODE_FROM_SUBGRAPH(conv, conv, pattern_name); \
GET_IR_NODE_FROM_SUBGRAPH(affine_channel, affine_channel, pattern_name); \
/* CONV inputs */ \
GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight, pattern_name); \
/* CONV outputs */ \
GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, pattern_name); \
/* Affine Channel inputs */ \
GET_IR_NODE_FROM_SUBGRAPH(ac_scale, ac_scale, pattern_name); \
GET_IR_NODE_FROM_SUBGRAPH(ac_bias, ac_bias, pattern_name); \
/* Affine channel outputs */ \
GET_IR_NODE_FROM_SUBGRAPH(ac_out, ac_out, pattern_name); /* Out */
void recompute_bias_and_weights(const Scope* scope, ir::Node* conv_weight,
const ir::Node& ac_scale,
const LoDTensor& ac_bias_tensor,
LoDTensor* eltwise_y_in_tensor) {
using EigenVectorArrayMap =
Eigen::Map<Eigen::Array<float, Eigen::Dynamic, 1>>;
using ConstEigenVectorArrayMap =
Eigen::Map<const Eigen::Array<float, Eigen::Dynamic, 1>>;
using EigenMatrixArrayMap = Eigen::Map<
Eigen::Array<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
// Re-compute bias of conv2d from AffineChannel
PADDLE_ENFORCE_EQ(
eltwise_y_in_tensor->dims(), ac_bias_tensor.dims(),
platform::errors::InvalidArgument(
"Tensor elementwise y(%d) and activation bias(%d) must have same "
"dimension.",
eltwise_y_in_tensor->dims().size(), ac_bias_tensor.dims().size()));
auto* scale_tensor = scope->FindVar(ac_scale.Name())->GetMutable<LoDTensor>();
ConstEigenVectorArrayMap scale_array(scale_tensor->data<float>(),
scale_tensor->numel(), 1);
ConstEigenVectorArrayMap ac_bias_array(ac_bias_tensor.data<float>(),
ac_bias_tensor.numel(), 1);
EigenVectorArrayMap eltwise_y_in_array(
eltwise_y_in_tensor->mutable_data<float>(platform::CPUPlace()),
eltwise_y_in_tensor->numel(), 1);
eltwise_y_in_array = (eltwise_y_in_array * scale_array) + ac_bias_array;
// Re-compute weight of conv2d from AffineChannel
auto* weights = scope->FindVar(conv_weight->Name())->GetMutable<LoDTensor>();
auto weights_shape = weights->dims();
auto weights_shape_2d = phi::flatten_to_2d(weights_shape, 1);
auto* weights_data = weights->mutable_data<float>(platform::CPUPlace());
EigenMatrixArrayMap weights_array_2d(weights_data, weights_shape_2d[0],
weights_shape_2d[1]);
weights_array_2d.colwise() *= scale_array;
// Check for subnormal values that slows down convolution execution
for (int i = 0; i < weights->numel(); ++i) {
if (std::fpclassify(weights_data[i]) == FP_SUBNORMAL) weights_data[i] = 0;
}
}
ConvAffineChannelFusePass::ConvAffineChannelFusePass() {
AddOpCompat(OpCompat("conv2d"))
.AddInput("Input")
.IsTensor()
.End()
.AddInput("Filter")
.IsTensor()
.End()
.AddInput("Bias")
.IsTensor()
.IsOptional()
.End()
.AddInput("ResidualData")
.IsTensor()
.IsOptional()
.End()
.AddOutput("Output")
.IsTensor()
.End()
.AddAttr("strides")
.IsType<std::vector<int>>()
.End()
.AddAttr("paddings")
.IsType<std::vector<int>>()
.End()
.AddAttr("padding_algorithm")
.IsOptional()
.IsStringIn({"EXPLICIT", "SAME", "VALID"})
.End()
.AddAttr("groups")
.IsNumGE(1)
.End()
.AddAttr("dilations")
.IsType<std::vector<int>>()
.End()
.AddAttr("data_format")
.IsStringIn({"NCHW", "AnyLayout"})
.End();
AddOpCompat(OpCompat("affine_channel"))
.AddInput("X")
.IsTensor()
.End()
.AddInput("Scale")
.IsTensor()
.End()
.AddInput("Bias")
.IsTensor()
.IsOptional()
.End()
.AddOutput("Out")
.IsTensor()
.End()
.AddAttr("data_layout")
.IsStringIn({"NCHW", "AnyLayout"})
.End();
AddOpCompat(OpCompat("elementwise_add"))
.AddInput("X")
.IsTensor()
.End()
.AddInput("Y")
.IsTensor()
.End()
.AddOutput("Out")
.IsTensor()
.End()
.AddAttr("axis")
.IsNumEQ(1)
.End();
}
void ConvAffineChannelFusePass::ApplyImpl(ir::Graph* graph) const {
PADDLE_ENFORCE_NOT_NULL(
graph, platform::errors::InvalidArgument("Graph cannot be nullptr."));
FusePassBase::Init(name_scope_, graph);
auto* scope = param_scope();
PADDLE_ENFORCE_NOT_NULL(
scope, platform::errors::InvalidArgument("Scope cannot be nullptr."));
GraphPatternDetector gpd;
auto* conv_input =
gpd.mutable_pattern()
->NewNode(patterns::PDNodeName(name_scope_, "conv_input"))
->AsInput()
->assert_is_op_input("conv2d", "Input");
patterns::ConvAffineChannel conv_ac_pattern(gpd.mutable_pattern(),
name_scope_);
conv_ac_pattern(conv_input, false /*with_eltwise_add*/);
int found_conv_ac_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
if (!IsCompat(subgraph, g)) {
LOG(WARNING) << "ConvAffineChannelFusePass in op compat failed.";
return;
}
VLOG(4) << "handle ConvAffineChannel fuse";
GET_CONV_BN_NODES(conv_ac_pattern);
auto data_format = conv->Op()->GetAttrIfExists<std::string>("data_format");
if (data_format == "AnyLayout") {
LOG_FIRST_N(WARNING, 1) << "conv_affine_channel_fuse_pass is enabled, "
"it's wrong if data_format of conv is not "
"NCHW.";
}
// Get affine_channel bias for resizing eltwise_y!
auto* ac_bias_tensor =
scope->FindVar(ac_bias->Name())->GetMutable<LoDTensor>();
// Create eltwise_y (conv bias) variable
VarDesc eltwise_y_in_desc(
patterns::PDNodeName(name_scope_, "eltwise_y_in"));
// Set shape && datatype manually
eltwise_y_in_desc.SetShape(phi::vectorize(ac_bias_tensor->dims()));
eltwise_y_in_desc.SetDataType(
framework::TransToProtoVarType(ac_bias_tensor->dtype()));
eltwise_y_in_desc.SetLoDLevel(ac_bias->Var()->GetLoDLevel());
eltwise_y_in_desc.SetPersistable(true);
// Initialize eltwise_y
auto* eltwise_y_in_node = g->CreateVarNode(&eltwise_y_in_desc);
auto* eltwise_y_in_tensor =
scope->Var(eltwise_y_in_node->Name())->GetMutable<LoDTensor>();
eltwise_y_in_tensor->Resize(ac_bias_tensor->dims());
std::fill_n(eltwise_y_in_tensor->mutable_data<float>(platform::CPUPlace()),
eltwise_y_in_tensor->numel(), 0.0f);
// update weights and biases
recompute_bias_and_weights(scope, conv_weight, *ac_scale, *ac_bias_tensor,
eltwise_y_in_tensor);
// create an elementwise add node.
OpDesc desc;
desc.SetInput("X", std::vector<std::string>({conv_out->Name()}));
desc.SetInput("Y", std::vector<std::string>({eltwise_y_in_node->Name()}));
desc.SetOutput("Out", std::vector<std::string>({ac_out->Name()}));
desc.SetType("elementwise_add");
desc.SetAttr("axis", 1);
desc.SetAttr("use_mkldnn", conv->Op()->GetAttrIfExists<bool>("use_mkldnn"));
auto eltwise_op = g->CreateOpNode(&desc); // OpDesc will be copied.
GraphSafeRemoveNodes(graph, {ac_scale, ac_bias, affine_channel});
IR_NODE_LINK_TO(conv_out, eltwise_op);
IR_NODE_LINK_TO(eltwise_y_in_node, eltwise_op);
IR_NODE_LINK_TO(eltwise_op, ac_out);
found_conv_ac_count++;
};
gpd(graph, handler);
AddStatis(found_conv_ac_count);
}
ConvEltwiseAddAffineChannelFusePass::ConvEltwiseAddAffineChannelFusePass() {
AddOpCompat(OpCompat("conv2d"))
.AddInput("Input")
.IsTensor()
.End()
.AddInput("Filter")
.IsTensor()
.End()
.AddInput("Bias")
.IsTensor()
.IsOptional()
.End()
.AddInput("ResidualData")
.IsTensor()
.IsOptional()
.End()
.AddOutput("Output")
.IsTensor()
.End()
.AddAttr("strides")
.IsType<std::vector<int>>()
.End()
.AddAttr("paddings")
.IsType<std::vector<int>>()
.End()
.AddAttr("padding_algorithm")
.IsOptional()
.IsStringIn({"EXPLICIT", "SAME", "VALID"})
.End()
.AddAttr("groups")
.IsNumGE(1)
.End()
.AddAttr("dilations")
.IsType<std::vector<int>>()
.End()
.AddAttr("data_format")
.IsStringIn({"NCHW", "AnyLayout"})
.End();
AddOpCompat(OpCompat("affine_channel"))
.AddInput("X")
.IsTensor()
.End()
.AddInput("Scale")
.IsTensor()
.End()
.AddInput("Bias")
.IsTensor()
.IsOptional()
.End()
.AddOutput("Out")
.IsTensor()
.End()
.AddAttr("data_layout")
.IsStringIn({"NCHW", "AnyLayout"})
.End();
AddOpCompat(OpCompat("elementwise_add"))
.AddInput("X")
.IsTensor()
.End()
.AddInput("Y")
.IsTensor()
.End()
.AddOutput("Out")
.IsTensor()
.End()
.AddAttr("axis")
.IsNumEQ(1)
.End();
}
void ConvEltwiseAddAffineChannelFusePass::ApplyImpl(ir::Graph* graph) const {
PADDLE_ENFORCE_NOT_NULL(
graph, platform::errors::InvalidArgument("Graph cannot be nullptr."));
FusePassBase::Init(name_scope_, graph);
auto* scope = param_scope();
PADDLE_ENFORCE_NOT_NULL(
scope, platform::errors::InvalidArgument("Scope cannot be nullptr."));
GraphPatternDetector gpd;
auto* conv_input =
gpd.mutable_pattern()
->NewNode(patterns::PDNodeName(name_scope_, "conv_input"))
->AsInput()
->assert_is_op_input("conv2d", "Input");
patterns::ConvAffineChannel conv_ac_pattern(gpd.mutable_pattern(),
name_scope_);
conv_ac_pattern(conv_input, true /*with_eltwise_add*/);
int found_conv_ac_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
if (!IsCompat(subgraph, g)) {
LOG(WARNING)
<< "ConvEltwiseAddAffineChannelFusePass in op compat failed.";
return;
}
VLOG(4) << "handle ConvBN fuse";
GET_CONV_BN_NODES(conv_ac_pattern);
auto data_format = conv->Op()->GetAttrIfExists<std::string>("data_format");
if (data_format == "AnyLayout") {
LOG_FIRST_N(WARNING, 1) << "conv_eltwiseadd_affine_channel_fuse_pass is "
"enabled, it's wrong if data_format of conv "
"is not NCHW.";
}
// OPERATORS
GET_IR_NODE_FROM_SUBGRAPH(eltwise, eltwise, conv_ac_pattern);
// BIAS inputs
GET_IR_NODE_FROM_SUBGRAPH(eltwise_y_in, eltwise_y_in, conv_ac_pattern);
// BIAS outputs
GET_IR_NODE_FROM_SUBGRAPH(eltwise_out, eltwise_out, conv_ac_pattern);
// Get eltwise_y (conv bias) variable
auto* eltwise_y_in_tensor =
scope->FindVar(eltwise_y_in->Name())->GetMutable<LoDTensor>();
// Get batch norm bias
auto* ac_bias_tensor =
scope->FindVar(ac_bias->Name())->GetMutable<LoDTensor>();
recompute_bias_and_weights(scope, conv_weight, *ac_scale, *ac_bias_tensor,
eltwise_y_in_tensor);
// Update the elementwise_add node
eltwise->Op()->SetAttr("axis", 1);
eltwise->Op()->SetOutput("Out", std::vector<std::string>({ac_out->Name()}));
GraphSafeRemoveNodes(graph,
{ac_scale, ac_bias, affine_channel, eltwise_out});
IR_NODE_LINK_TO(eltwise, ac_out);
found_conv_ac_count++;
};
gpd(graph, handler);
AddStatis(found_conv_ac_count);
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(conv_affine_channel_fuse_pass,
paddle::framework::ir::ConvAffineChannelFusePass);
REGISTER_PASS(conv_eltwiseadd_affine_channel_fuse_pass,
paddle::framework::ir::ConvEltwiseAddAffineChannelFusePass);
REGISTER_PASS_CAPABILITY(conv_affine_channel_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.LE("conv2d", 1)
.EQ("affine_channel", 0));
REGISTER_PASS_CAPABILITY(conv_eltwiseadd_affine_channel_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.LE("conv2d", 1)
.LE("elementwise_add", 1)
.EQ("affine_channel", 0));
......@@ -2074,6 +2074,7 @@ void OperatorWithKernel::BuildPhiKernelContext(
}
pt_kernel_context->AssignInputRange(std::make_pair(start_idx, end_idx), i);
}
VLOG(4) << "Done inputs";
for (size_t i = 0; i < output_names.size(); ++i) {
auto it = ctx.outputs.find(output_names[i]);
......@@ -2107,17 +2108,12 @@ void OperatorWithKernel::BuildPhiKernelContext(
"Unsupported output `%s` type when call pt kernel.",
framework::ToTypeName(var->Type())));
}
experimental::ResetTensorDtypeAndLayoutByArgDef(tensor_out,
output_defs.at(i));
SetAllocationForOutputTenosr(
tensor_out, phi::TransToPhiPlace(output_defs.at(i).backend));
pt_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out);
}
pt_kernel_context->AssignOutputRange(std::make_pair(start_idx, end_idx), i);
}
VLOG(4) << "Done outputs";
for (size_t i = 0; i < attr_names.size(); ++i) {
if (attr_defs[i].type_index == std::type_index(typeid(phi::ScalarArray))) {
......@@ -2226,6 +2222,7 @@ void OperatorWithKernel::BuildPhiKernelContext(
}
}
}
VLOG(4) << "Done attributes";
}
} // namespace framework
......
......@@ -68,6 +68,8 @@ OpKernelType TransPhiKernelKeyToOpKernelType(const phi::KernelKey& kernel_key) {
library_type = LibraryType::kMKLDNN;
} else if (kernel_key.backend() == phi::Backend::GPUDNN) {
library_type = LibraryType::kCUDNN;
} else if (kernel_key.backend() == phi::Backend::KPS) {
library_type = LibraryType::kKP;
} else {
// do nothing
}
......@@ -82,6 +84,8 @@ phi::KernelKey TransOpKernelTypeToPhiKernelKey(
backend = phi::Backend::MKLDNN;
} else if (kernel_type.library_type_ == LibraryType::kCUDNN) {
backend = phi::Backend::GPUDNN;
} else if (kernel_type.library_type_ == LibraryType::kKP) {
backend = phi::Backend::KPS;
} else {
// do
}
......@@ -229,26 +233,5 @@ static void SetAllocationForUninitializedDenseTensor(
dense_tensor->ResetHolder(shared_allocation);
}
void SetAllocationForOutputTenosr(phi::TensorBase* tensor,
const platform::Place& place) {
if (phi::DenseTensor::classof(tensor)) {
auto* dense_tensor = static_cast<phi::DenseTensor*>(tensor);
if (!dense_tensor->IsInitialized() || !(dense_tensor->place() == place)) {
SetAllocationForUninitializedDenseTensor(dense_tensor, place);
}
} else if (phi::SelectedRows::classof(tensor)) {
auto* selected_rows = static_cast<phi::SelectedRows*>(tensor);
if (!selected_rows->value().IsInitialized() ||
!(selected_rows->place() == place)) {
SetAllocationForUninitializedDenseTensor(selected_rows->mutable_value(),
place);
}
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Unsupported tensor type is received when setting allocation for "
"output tensor."));
}
}
} // namespace framework
} // namespace paddle
......@@ -62,9 +62,6 @@ class KernelArgsNameMaker {
void InitDefaultKernelSignatureMap();
void SetAllocationForOutputTenosr(phi::TensorBase* tensor,
const platform::Place& place);
// TODO(Wilber): support others device context.
template <typename T>
struct ConvertToPhiContext {
......
......@@ -323,12 +323,6 @@ void BuildDygraphPhiKernelContext(
"Unsupported output `%s` type when call pt kernel.",
framework::ToTypeName(var->Type())));
}
experimental::ResetTensorDtypeAndLayoutByArgDef(tensor_out,
output_defs.at(i));
framework::SetAllocationForOutputTenosr(
tensor_out, phi::TransToPhiPlace(output_defs.at(i).backend));
kernel_ctx->EmplaceBackOutputWithoutSetRange(tensor_out);
}
kernel_ctx->AssignOutputRange(std::make_pair(start_idx, end_idx), i);
......
......@@ -75,9 +75,7 @@ void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) {
void PaddlePassBuilder::ClearPasses() { passes_.clear(); }
const std::vector<std::string> kTRTSubgraphPasses({
"conv_affine_channel_fuse_pass", //
"adaptive_pool2d_convert_global_pass",
"conv_eltwiseadd_affine_channel_fuse_pass", //
"shuffle_channel_detect_pass", //
"quant_conv2d_dequant_fuse_pass", //
"delete_quant_dequant_op_pass", //
......@@ -136,8 +134,6 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
// "identity_scale_op_clean_pass", //
"is_test_pass", //
"simplify_with_basic_ops_pass", //
"conv_affine_channel_fuse_pass", //
"conv_eltwiseadd_affine_channel_fuse_pass", //
"conv_bn_fuse_pass", //
"conv_eltwiseadd_bn_fuse_pass", //
"embedding_eltwise_layernorm_fuse_pass", //
......@@ -239,8 +235,6 @@ void CpuPassStrategy::EnableMKLDNN() {
"depthwise_conv_mkldnn_pass", //
"conv_bn_fuse_pass", // Execute BN passes again to
"conv_eltwiseadd_bn_fuse_pass", // preserve correct pass order
"conv_affine_channel_fuse_pass", //
"conv_eltwiseadd_affine_channel_fuse_pass", //
"conv_transpose_bn_fuse_pass", //
"conv_transpose_eltwiseadd_bn_fuse_pass", //
"conv_bias_mkldnn_fuse_pass", //
......
......@@ -29,9 +29,5 @@ using CUDA = paddle::platform::CUDADeviceContext;
ops::CastOpKernel<CUDA, plat::complex<float>>, \
ops::CastOpKernel<CUDA, plat::complex<double>>, ##__VA_ARGS__);
#if !defined(PADDLE_WITH_HIP)
// See [ why register transfer_dtype_op alias with cast_op? ] in cast_op.cc
REGISTER_CAST_CUDA_BASE(transfer_dtype, ops::CastOpKernel<CUDA, plat::bfloat16>)
#else
REGISTER_CAST_CUDA_BASE(transfer_dtype)
#endif
......@@ -18,7 +18,9 @@ limitations under the License. */
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/phi/infermeta/multiary.h"
#include "paddle/phi/kernels/funcs/concat_funcs.h"
#ifdef PADDLE_WITH_MKLDNN
......@@ -33,41 +35,6 @@ class ConcatOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", "Concat");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Concat");
auto inputs_dims = ctx->GetInputsDim("X");
const size_t inputs_num = inputs_dims.size();
PADDLE_ENFORCE_GT(
inputs_num, static_cast<size_t>(0),
platform::errors::InvalidArgument(
"The number of input tensors in concat op should > 0. But "
"received inputs' length is 0."));
if (inputs_num == 1) {
VLOG(3) << "Warning: concat op have only one input, may waste memory";
}
if (ctx->HasInput("AxisTensor")) {
auto out_dims =
phi::make_ddim(std::vector<int>(inputs_dims[0].size(), -1));
ctx->SetOutputDim("Out", out_dims);
ctx->ShareLoD("X", /*->*/ "Out");
} else {
size_t axis =
ComputeAxis(static_cast<int64_t>(ctx->Attrs().Get<int>("axis")),
static_cast<int64_t>(inputs_dims[0].size()));
framework::DDim out_dims =
phi::funcs::ComputeAndCheckShape(ctx->IsRuntime(), inputs_dims, axis);
if (out_dims[axis] < 0) {
out_dims[axis] = -1;
}
ctx->SetOutputDim("Out", out_dims);
ctx->ShareLoD("X", /*->*/ "Out");
}
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
......@@ -237,9 +204,14 @@ class ConcatDoubleGradOpMaker : public framework::SingleGradOpMaker<T> {
} // namespace paddle
namespace ops = paddle::operators;
DELCARE_INFER_SHAPE_FUNCTOR(concat, ConcatInferShapeFunctor,
PT_INFER_META(phi::ConcatInferMeta));
REGISTER_OPERATOR(concat, ops::ConcatOp, ops::ConcatOpMaker,
ops::ConcatGradOpMaker<paddle::framework::OpDesc>,
ops::ConcatGradOpMaker<paddle::imperative::OpBase>);
ops::ConcatGradOpMaker<paddle::imperative::OpBase>,
ConcatInferShapeFunctor);
REGISTER_OPERATOR(concat_grad, ops::ConcatOpGrad,
ops::ConcatDoubleGradOpMaker<paddle::framework::OpDesc>,
ops::ConcatDoubleGradOpMaker<paddle::imperative::OpBase>,
......
......@@ -20,5 +20,5 @@ else()
endif()
file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(equal_all);\nUSE_NO_KERNEL_OP(read_from_array);\n")
file(APPEND ${pybind_file} "USE_OP(logical_and);\nUSE_OP(logical_or);\nUSE_OP(logical_xor);\nUSE_OP(logical_not);\n")
file(APPEND ${pybind_file} "USE_OP_ITSELF(logical_and);\nUSE_OP_ITSELF(logical_or);\nUSE_OP_ITSELF(logical_xor);\nUSE_OP_ITSELF(logical_not);\n")
file(APPEND ${pybind_file} "USE_OP(bitwise_and);\nUSE_OP(bitwise_or);\nUSE_OP(bitwise_xor);\nUSE_OP(bitwise_not);\n")
......@@ -9,11 +9,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/controlflow/logical_op.h"
#include <algorithm>
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
namespace paddle {
namespace operators {
......@@ -145,15 +145,7 @@ class BinaryLogicalOp : public LogicalOp {
::paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_BINARY_LOGICAL_OP(logical_and, "$$Out = X \\&\\& Y$$");
REGISTER_BINARY_LOGICAL_KERNEL(logical_and, CPU,
paddle::operators::LogicalAndFunctor);
REGISTER_BINARY_LOGICAL_OP(logical_or, "$$Out = X || Y$$");
REGISTER_BINARY_LOGICAL_KERNEL(logical_or, CPU,
paddle::operators::LogicalOrFunctor);
REGISTER_UNARY_LOGICAL_OP(logical_not, "$$Out = !X$$");
REGISTER_UNARY_LOGICAL_KERNEL(logical_not, CPU,
paddle::operators::LogicalNotFunctor);
REGISTER_BINARY_LOGICAL_OP(logical_xor,
"$$Out = (X || Y) \\&\\& !(X \\&\\& Y)$$");
REGISTER_BINARY_LOGICAL_KERNEL(logical_xor, CPU,
paddle::operators::LogicalXorFunctor);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/controlflow/logical_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h"
namespace paddle {
namespace operators {
template <typename Functor>
class BinaryLogicalOpKernel<platform::CUDADeviceContext, Functor>
: public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
using InT = typename Functor::ELEMENT_TYPE;
using OutT = bool;
auto functor = Functor();
std::vector<const framework::Tensor*> ins;
std::vector<framework::Tensor*> outs;
const auto& cuda_ctx =
ctx.template device_context<platform::CUDADeviceContext>();
int axis = PackTensorsIntoVector<OutT>(ctx, &ins, &outs);
if (ins.size() == 1) {
paddle::operators::LaunchElementwiseCudaKernel<ElementwiseType::kUnary,
InT, OutT>(
cuda_ctx, ins, &outs, axis, functor);
} else {
paddle::operators::LaunchElementwiseCudaKernel<ElementwiseType::kBinary,
InT, OutT>(
cuda_ctx, ins, &outs, axis, functor);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
#define REGISTER_LOGICAL_CUDA_KERNEL(op_name, func) \
REGISTER_OP_CUDA_KERNEL( \
op_name, \
ops::BinaryLogicalOpKernel<plat::CUDADeviceContext, ops::func<bool>>, \
ops::BinaryLogicalOpKernel<plat::CUDADeviceContext, ops::func<int8_t>>, \
ops::BinaryLogicalOpKernel<plat::CUDADeviceContext, ops::func<int16_t>>, \
ops::BinaryLogicalOpKernel<plat::CUDADeviceContext, ops::func<int>>, \
ops::BinaryLogicalOpKernel<plat::CUDADeviceContext, ops::func<int64_t>>, \
ops::BinaryLogicalOpKernel<plat::CUDADeviceContext, ops::func<float>>, \
ops::BinaryLogicalOpKernel<plat::CUDADeviceContext, ops::func<double>>);
REGISTER_LOGICAL_CUDA_KERNEL(logical_or, LogicalOrFunctor)
REGISTER_LOGICAL_CUDA_KERNEL(logical_and, LogicalAndFunctor)
REGISTER_LOGICAL_CUDA_KERNEL(logical_xor, LogicalXorFunctor)
REGISTER_LOGICAL_CUDA_KERNEL(logical_not, LogicalNotFunctor)
#undef REGISTER_LOGICAL_CUDA_KERNEL
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <math.h>
#include <type_traits>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
#include "paddle/fluid/platform/transform.h"
namespace paddle {
namespace operators {
#define LOGICAL_BINARY_FUNCTOR(func_name, op) \
template <typename T> \
struct func_name { \
using ELEMENT_TYPE = T; \
HOSTDEVICE bool operator()(const T a, const T b) const { \
return static_cast<bool>(a) op static_cast<bool>(b); \
} \
};
LOGICAL_BINARY_FUNCTOR(LogicalOrFunctor, ||)
LOGICAL_BINARY_FUNCTOR(LogicalAndFunctor, &&)
LOGICAL_BINARY_FUNCTOR(LogicalXorFunctor, ^)
#undef LOGICAL_BINARY_FUNCTOR
template <typename T>
struct LogicalNotFunctor {
using ELEMENT_TYPE = T;
HOSTDEVICE bool operator()(const T a) const { return !a; }
};
template <typename DeviceContext, typename Functor>
class BinaryLogicalOpKernel
: public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
public:
void Compute(const framework::ExecutionContext& context) const override {
using T = typename Functor::ELEMENT_TYPE;
auto* x = context.Input<framework::Tensor>("X");
auto* y = context.Input<framework::Tensor>("Y");
auto* out = context.Output<framework::Tensor>("Out");
Functor binary_func;
ElementwiseComputeEx<Functor, DeviceContext, T, bool>(context, x, y, -1,
binary_func, out);
}
};
template <typename DeviceContext, typename Functor>
class UnaryLogicalOpKernel
: public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
public:
void Compute(const framework::ExecutionContext& context) const override {
using T = typename Functor::ELEMENT_TYPE;
auto* x = context.Input<framework::Tensor>("X");
auto* out = context.Output<framework::Tensor>("Out");
Functor unary_func;
platform::Transform<DeviceContext> trans;
trans(context.template device_context<DeviceContext>(), x->data<T>(),
x->data<T>() + x->numel(),
out->mutable_data<bool>(context.GetPlace()), unary_func);
}
};
} // namespace operators
} // namespace paddle
#define REGISTER_BINARY_LOGICAL_KERNEL(op_type, dev, functor) \
REGISTER_OP_##dev##_KERNEL( \
op_type, ::paddle::operators::BinaryLogicalOpKernel< \
::paddle::platform::dev##DeviceContext, functor<bool>>, \
::paddle::operators::BinaryLogicalOpKernel< \
::paddle::platform::dev##DeviceContext, functor<int8_t>>, \
::paddle::operators::BinaryLogicalOpKernel< \
::paddle::platform::dev##DeviceContext, functor<int16_t>>, \
::paddle::operators::BinaryLogicalOpKernel< \
::paddle::platform::dev##DeviceContext, functor<int>>, \
::paddle::operators::BinaryLogicalOpKernel< \
::paddle::platform::dev##DeviceContext, functor<int64_t>>, \
::paddle::operators::BinaryLogicalOpKernel< \
::paddle::platform::dev##DeviceContext, functor<float>>, \
::paddle::operators::BinaryLogicalOpKernel< \
::paddle::platform::dev##DeviceContext, functor<double>>);
#define REGISTER_UNARY_LOGICAL_KERNEL(op_type, dev, functor) \
REGISTER_OP_##dev##_KERNEL( \
op_type, ::paddle::operators::UnaryLogicalOpKernel< \
::paddle::platform::dev##DeviceContext, functor<bool>>, \
::paddle::operators::UnaryLogicalOpKernel< \
::paddle::platform::dev##DeviceContext, functor<int8_t>>, \
::paddle::operators::UnaryLogicalOpKernel< \
::paddle::platform::dev##DeviceContext, functor<int16_t>>, \
::paddle::operators::UnaryLogicalOpKernel< \
::paddle::platform::dev##DeviceContext, functor<int>>, \
::paddle::operators::UnaryLogicalOpKernel< \
::paddle::platform::dev##DeviceContext, functor<int64_t>>, \
::paddle::operators::UnaryLogicalOpKernel< \
::paddle::platform::dev##DeviceContext, functor<float>>, \
::paddle::operators::UnaryLogicalOpKernel< \
::paddle::platform::dev##DeviceContext, functor<double>>);
......@@ -9,7 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/controlflow/logical_op.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
......
......@@ -14,6 +14,10 @@
#include "paddle/fluid/operators/dot_op.h"
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/binary.h"
namespace paddle {
namespace operators {
......@@ -21,51 +25,6 @@ class DotOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE_EQ(true, ctx->HasInput("X"),
platform::errors::PreconditionNotMet(
"Input(X) of DotOp should not be null."));
PADDLE_ENFORCE_EQ(true, ctx->HasInput("Y"),
platform::errors::PreconditionNotMet(
"Input(Y) of DotOp should not be null."));
PADDLE_ENFORCE_EQ(true, ctx->HasOutput("Out"),
platform::errors::PreconditionNotMet(
"Output(Out) of DotOp should not be null."));
auto x_dims = ctx->GetInputDim("X");
auto x_rank = static_cast<size_t>(x_dims.size());
PADDLE_ENFORCE_EQ(true, 1 == x_rank || 2 == x_rank,
platform::errors::PreconditionNotMet(
"ShapeError: The dimensions of input tensor X (%s) "
"should be 1 or 2",
x_dims.to_str()));
auto y_dims = ctx->GetInputDim("Y");
PADDLE_ENFORCE_EQ(
true, x_rank == (size_t)y_dims.size(),
platform::errors::PreconditionNotMet(
"ShapeError: The shape of input tensor Y: %s should match with "
"input tenosr X: %s",
y_dims.to_str(), x_dims.to_str()));
bool shape_match = true;
for (size_t i = 0; i < x_rank; ++i) {
if (x_dims[i] != y_dims[i]) {
shape_match = false;
break;
}
}
PADDLE_ENFORCE_EQ(true, shape_match,
platform::errors::PreconditionNotMet(
"ShapeError: The shape of input tensor X: %s should "
"be exactly the same "
"with input tensor Y: %s",
x_dims.to_str(), y_dims.to_str()));
auto dims = vectorize(x_dims);
dims[dims.size() - 1] = 1;
ctx->SetOutputDim("Out", phi::make_ddim(dims));
}
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
......@@ -142,9 +101,13 @@ class DotOpGradMaker : public framework::SingleGradOpMaker<T> {
namespace ops = paddle::operators;
DELCARE_INFER_SHAPE_FUNCTOR(dot, DotInferShapeFunctor,
PT_INFER_META(phi::DotInferMeta));
REGISTER_OPERATOR(dot, ops::DotOp, ops::DotOpMaker,
ops::DotOpGradMaker<paddle::framework::OpDesc>,
ops::DotOpGradMaker<paddle::imperative::OpBase>);
ops::DotOpGradMaker<paddle::imperative::OpBase>,
DotInferShapeFunctor);
REGISTER_OPERATOR(dot_grad, ops::DotGradOp);
......
......@@ -201,12 +201,14 @@ REGISTER_OPERATOR(gather_grad, ops::GatherGradOp,
REGISTER_OP_CPU_KERNEL(gather, ops::GatherOpKernel<float>,
ops::GatherOpKernel<double>, ops::GatherOpKernel<int>,
ops::GatherOpKernel<uint8_t>,
ops::GatherOpKernel<int64_t>);
ops::GatherOpKernel<int64_t>,
ops::GatherOpKernel<phi::dtype::bfloat16>);
REGISTER_OP_CPU_KERNEL(gather_grad, ops::GatherGradientOpKernel<float>,
ops::GatherGradientOpKernel<double>,
ops::GatherGradientOpKernel<int>,
ops::GatherGradientOpKernel<uint8_t>,
ops::GatherGradientOpKernel<int64_t>);
ops::GatherGradientOpKernel<int64_t>,
ops::GatherGradientOpKernel<phi::dtype::bfloat16>);
REGISTER_OP_VERSION(gather)
.AddCheckpoint(R"ROC(upgrad gather, add a new input [Axis])ROC",
paddle::framework::compatible::OpVersionDesc().NewInput(
......
......@@ -130,9 +130,11 @@ REGISTER_OP_CUDA_KERNEL(gather, ops::GatherOpCUDAKernel<float>,
ops::GatherOpCUDAKernel<double>,
ops::GatherOpCUDAKernel<int64_t>,
ops::GatherOpCUDAKernel<int>,
ops::GatherOpCUDAKernel<plat::float16>);
ops::GatherOpCUDAKernel<plat::float16>,
ops::GatherOpCUDAKernel<plat::bfloat16>);
REGISTER_OP_CUDA_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel<float>,
ops::GatherGradOpCUDAKernel<double>,
ops::GatherGradOpCUDAKernel<int64_t>,
ops::GatherGradOpCUDAKernel<int>,
ops::GatherGradOpCUDAKernel<plat::float16>);
ops::GatherGradOpCUDAKernel<plat::float16>,
ops::GatherGradOpCUDAKernel<plat::bfloat16>);
......@@ -29,6 +29,7 @@ namespace operators {
using DataLayout = framework::DataLayout;
enum GroupNormKernelFlags { kHasScale = 1, kHasBias = 2 };
#define ALIGN_BYTES 16
#define CHECK_CASE(i, flags, kernel_name, ...) \
if (i == flags) { \
......@@ -56,8 +57,7 @@ __device__ __inline__ void CudaAtomicAddWithWarp(T* sum, T value) {
template <typename T>
__global__ void GroupNormForwardGetMeanAndVar(const T* x, int N, int C, int W,
int imsize, int groups,
int group_size, T* mean, T* var,
const DataLayout data_layout) {
int group_size, T* mean, T* var) {
int gid = blockIdx.y;
int cid = blockIdx.x;
int bid = blockIdx.z;
......@@ -68,13 +68,10 @@ __global__ void GroupNormForwardGetMeanAndVar(const T* x, int N, int C, int W,
T x_mean = 0, x_var = 0;
for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) {
T val;
if (data_layout == DataLayout::kNCHW) {
val = x[(bid * C + ccid) * imsize + imid];
} else {
int hid = imid / W;
int wid = imid % W;
val = x[(bid * H + hid) * W * C + wid * C + ccid];
}
x_mean += val;
x_var += val * val;
}
......@@ -84,6 +81,85 @@ __global__ void GroupNormForwardGetMeanAndVar(const T* x, int N, int C, int W,
CudaAtomicAddWithWarp(&var[bid * groups + gid], x_var);
}
template <typename T, typename AccT, int VecSize>
__device__ __forceinline__ void ThreadReduce(const T* input, int size,
const int offset, AccT* mean,
AccT* var) {
using VecT = kps::details::VectorType<T, VecSize>;
int tid = threadIdx.x;
if (offset > 0) {
input -= offset;
size += offset;
if (tid >= offset) {
AccT temp = input[tid];
*mean += temp;
*var += temp * temp;
}
size -= blockDim.x;
input += blockDim.x;
}
int remain = size % (VecSize * blockDim.x);
T ins[VecSize];
VecT* ins_vec = reinterpret_cast<VecT*>(&ins);
// vector part
for (; VecSize * tid < (size - remain); tid += blockDim.x) {
*ins_vec = reinterpret_cast<const VecT*>(input)[tid];
#pragma unroll
for (int i = 0; i < VecSize; ++i) {
AccT temp = ins[i];
*mean += temp;
*var += temp * temp;
}
}
// scalar part
tid = size - remain + threadIdx.x;
for (; tid < size; tid += blockDim.x) {
AccT temp = input[tid];
*mean += temp;
*var += temp * temp;
}
}
template <typename T>
__global__ void ScalarGetMeanAndVarNCHW(const T* x, T* mean, T* var, int size) {
int i = blockIdx.x;
T x_mean = 0, x_var = 0;
for (int j = threadIdx.x; j < size; j += blockDim.x) {
T val;
val = x[i * size + j];
x_mean += val;
x_var += val * val;
}
x_mean /= size;
x_var /= size;
CudaAtomicAddWithWarp(&mean[i], x_mean);
CudaAtomicAddWithWarp(&var[i], x_var);
}
template <typename T, typename AccT, int VecSize>
__global__ void VectorizedGetMeanAndVarNCHW(const T* x, T* mean, T* var,
int size) {
int i = blockIdx.x;
AccT x_mean = static_cast<AccT>(0);
AccT x_var = static_cast<AccT>(0);
const int input_offset = ((uint64_t)x) % ALIGN_BYTES / sizeof(T);
x += i * size;
ThreadReduce<T, AccT, VecSize>(x, size, input_offset, &x_mean, &x_var);
x_mean = kps::details::BlockXReduce<AccT, kps::AddFunctor<AccT>>(
x_mean, kps::AddFunctor<AccT>());
x_var = kps::details::BlockXReduce<AccT, kps::AddFunctor<AccT>>(
x_var, kps::AddFunctor<AccT>());
__syncthreads();
if (threadIdx.x == 0) {
mean[i] = static_cast<T>(x_mean / size);
var[i] = static_cast<T>(x_var / size);
}
}
template <typename T, int flags>
__global__ void GroupNormForward(const T* x, const T* mean, const T* var,
const T* scale, const T* bias, int N, int C,
......@@ -96,26 +172,34 @@ __global__ void GroupNormForward(const T* x, const T* mean, const T* var,
int H = imsize / W;
int ccid = gid * group_size + cid;
if (ccid >= C) return;
T x_mean = mean[bid * groups + gid];
T x_var = var[bid * groups + gid];
auto ng = bid * groups + gid;
T x_mean = mean[ng];
T x_var = var[ng];
x_var = x_var - x_mean * x_mean;
T var_inv = 1.0 / sqrt(x_var + epsilon);
if (cid == 0 && threadIdx.x == 0) real_var[bid * groups + gid] = x_var;
T var_inv = rsqrt(x_var + epsilon);
if (cid == 0 && threadIdx.x == 0) {
real_var[ng] = x_var;
}
for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) {
T val;
int hid, wid;
int index = (bid * C + ccid) * imsize + imid;
if (data_layout == DataLayout::kNCHW) {
val = x[(bid * C + ccid) * imsize + imid];
val = x[index];
} else {
hid = imid / W;
wid = imid % W;
val = x[(bid * H + hid) * W * C + wid * C + ccid];
}
val = (val - x_mean) * var_inv;
if (flags & kHasScale) val *= scale[gid * group_size + cid];
if (flags & kHasBias) val += bias[gid * group_size + cid];
if (flags & kHasScale) {
val *= scale[ccid];
}
if (flags & kHasBias) {
val += bias[ccid];
}
if (data_layout == DataLayout::kNCHW) {
y[(bid * C + ccid) * imsize + imid] = val;
y[index] = val;
} else {
y[(bid * H + hid) * W * C + wid * C + ccid] = val;
}
......@@ -182,16 +266,41 @@ class GroupNormKernel<platform::CUDADeviceContext, T>
imsize *= x_dims[i];
}
}
#ifdef __HIPCC__
int block_size = std::max(std::min(256, imsize), 64);
#else
int block_size = std::min(1024, imsize);
#endif
dim3 grid(group_size, groups, x_dims[0]);
dim3 threads(block_size, 1, 1);
if (data_layout == DataLayout::kNCHW) {
using AccT = typename details::MPTypeTrait<T>::Type;
constexpr int vec_size = sizeof(float4) / sizeof(T);
int size = group_size * imsize;
const int max_num_threads = 1024;
int max_block_size = std::min(size / vec_size, max_num_threads);
int block_size_nchw = 1;
while (block_size_nchw < max_block_size) {
block_size_nchw *= 2;
}
block_size_nchw = std::max(block_size_nchw, kps::details::kWarpSize);
dim3 grids(x_dims[0] * groups);
dim3 blocks(block_size_nchw);
if (size < vec_size) {
ScalarGetMeanAndVarNCHW<T><<<grids, blocks, 0, dev_ctx.stream()>>>(
x_data, mean_data, temp_var_data, size);
} else {
VectorizedGetMeanAndVarNCHW<
T, AccT, vec_size><<<grids, blocks, 0, dev_ctx.stream()>>>(
x_data, mean_data, temp_var_data, size);
}
} else {
GroupNormForwardGetMeanAndVar<T><<<grid, threads, 0, dev_ctx.stream()>>>(
x_data, x_dims[0], C, W, imsize, groups, group_size, mean_data,
temp_var_data, data_layout);
temp_var_data);
}
int flags =
(scale_data != nullptr) * kHasScale + (bias_data != nullptr) * kHasBias;
UNROLL_ALL_CASES(flags, GroupNormForward, x_data, mean_data, temp_var_data,
......
......@@ -12,12 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/index_sample_op.h"
#include <vector>
#include "paddle/fluid/framework/no_need_buffer_vars_inference.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/binary.h"
namespace paddle {
namespace operators {
class IndexSampleOpMaker : public framework::OpProtoAndCheckerMaker {
......@@ -42,44 +44,6 @@ class IndexSampleOpMaker : public framework::OpProtoAndCheckerMaker {
class IndexSampleOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
platform::errors::InvalidArgument(
"Inputs(Input) of FindByIndex should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasInput("Index"), true,
platform::errors::InvalidArgument(
"Inputs(Index) of FindByIndex should not be null."));
auto input_dims = ctx->GetInputDim("X");
PADDLE_ENFORCE_EQ(
input_dims.size(), 2,
platform::errors::InvalidArgument(
"Inputs(X) shape of IndexSample op should be 2-D, but "
"got X's shape = [%s], please check X shape.",
input_dims));
auto index_dims = ctx->GetInputDim("Index");
PADDLE_ENFORCE_EQ(
input_dims.size(), 2,
platform::errors::InvalidArgument(
"Inputs(Index) shape of IndexSample op should be 2-D, but "
"got Index's shape [%s] , please check index shape.",
input_dims));
if (ctx->IsRuntime()) {
PADDLE_ENFORCE_EQ(input_dims[0], index_dims[0],
platform::errors::InvalidArgument(
"Inputs(X)'s value of dimension 0 must same with "
"Inputs(Index)'s value of dimension 0, but "
"got %d of Inputs(X), and got %d of Inputs(Index), "
"please check Inputs shape.",
input_dims[0], index_dims[0]));
}
ctx->SetOutputDim("Out", index_dims);
auto type = ctx->GetInputsVarType("Index")[0];
if (type == framework::proto::VarType::LOD_TENSOR) {
ctx->ShareLoD("Index", /*->*/ "Out");
}
}
protected:
framework::OpKernelType GetExpectedKernelType(
......@@ -136,20 +100,11 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(IndexSampleGradNoNeedBufferVarInferer, "X");
} // namespace paddle
namespace ops = paddle::operators;
DELCARE_INFER_SHAPE_FUNCTOR(index_sample, IndexSampleInferShapeFunctor,
PT_INFER_META(phi::IndexSampleInferMeta));
REGISTER_OPERATOR(index_sample, ops::IndexSampleOp, ops::IndexSampleOpMaker,
ops::IndexSampleGradMaker<paddle::framework::OpDesc>,
ops::IndexSampleGradMaker<paddle::imperative::OpBase>);
ops::IndexSampleGradMaker<paddle::imperative::OpBase>,
IndexSampleInferShapeFunctor);
REGISTER_OPERATOR(index_sample_grad, ops::IndexSampleGradOp,
ops::IndexSampleGradNoNeedBufferVarInferer);
REGISTER_OP_CPU_KERNEL(
index_sample,
ops::IndexSampleKernel<paddle::platform::CPUDeviceContext, float>,
ops::IndexSampleKernel<paddle::platform::CPUDeviceContext, double>,
ops::IndexSampleKernel<paddle::platform::CPUDeviceContext, int>,
ops::IndexSampleKernel<paddle::platform::CPUDeviceContext, int64_t>);
REGISTER_OP_CPU_KERNEL(
index_sample_grad,
ops::IndexSampleGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::IndexSampleGradKernel<paddle::platform::CPUDeviceContext, double>,
ops::IndexSampleGradKernel<paddle::platform::CPUDeviceContext, int>,
ops::IndexSampleGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/index_sample_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#define PREDEFINED_BLOCK_SIZE_X 512
#define PREDEFINED_BLOCK_SIZE 1024
#define MIN(a, b) ((a) < (b) ? (a) : (b))
namespace paddle {
namespace operators {
namespace {
void LimitGridDim(const framework::ExecutionContext& ctx, dim3* grid_dim) {
auto max_grid_dim = ctx.template device_context<platform::CUDADeviceContext>()
.GetCUDAMaxGridDimSize();
grid_dim->x = grid_dim->x < max_grid_dim[0] ? grid_dim->x : max_grid_dim[0];
grid_dim->y = grid_dim->y < max_grid_dim[1] ? grid_dim->y : max_grid_dim[1];
}
}
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
template <typename T, typename IndexT = int>
__global__ void IndexSampleForward(const IndexT* index, const T* in_data,
T* out_data, size_t index_length,
size_t input_length, size_t batch_size) {
unsigned int index_i = blockDim.x * blockIdx.x + threadIdx.x;
unsigned int index_j = blockDim.y * blockIdx.y + threadIdx.y;
for (; index_j < batch_size; index_j += blockDim.y * gridDim.y) {
index_i = blockDim.x * blockIdx.x + threadIdx.x;
for (; index_i < index_length; index_i += blockDim.x * gridDim.x) {
unsigned int index_idx = index_j * index_length + index_i;
unsigned int in_idx = index_j * input_length + index_i;
IndexT sample_idx = index[index_idx];
out_data[index_idx] = in_data[in_idx - index_i + sample_idx];
}
}
}
template <typename T, typename IndexT = int>
__global__ void IndexSampleGrad(const IndexT* index, T* in_grad,
const T* out_grad, size_t index_length,
size_t input_length, size_t batch_size,
bool same_data_in_row = true) {
unsigned int index_i = blockDim.x * blockIdx.x + threadIdx.x;
unsigned int index_j = blockDim.y * blockIdx.y + threadIdx.y;
for (; index_j < batch_size; index_j += blockDim.y * gridDim.y) {
index_i = blockDim.x * blockIdx.x + threadIdx.x;
for (; index_i < index_length; index_i += blockDim.x * gridDim.x) {
unsigned int index_idx = index_j * index_length + index_i;
unsigned int in_idx = index_j * input_length + index_i;
IndexT sample_idx = index[index_idx];
if (same_data_in_row) {
platform::CudaAtomicAdd(&(in_grad[in_idx - index_i + sample_idx]),
out_grad[sample_idx]);
} else {
in_grad[in_idx - index_i + sample_idx] = out_grad[index_idx];
}
}
}
}
template <typename T>
class IndexSampleKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<LoDTensor>("X");
auto* index = ctx.Input<LoDTensor>("Index");
auto* output = ctx.Output<LoDTensor>("Out");
const auto& index_type = framework::TransToProtoVarType(index->dtype());
bool index_type_match = index_type == framework::proto::VarType::INT64 ||
index_type == framework::proto::VarType::INT32;
PADDLE_ENFORCE_EQ(index_type_match, true,
platform::errors::InvalidArgument(
"Input(Index) holds the wrong type, it holds %s, but "
"desires to be %s or %s",
paddle::framework::DataTypeToString(index_type),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT32),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT64)));
const auto* in_data = input->data<T>();
auto* out_data = output->mutable_data<T>(ctx.GetPlace());
auto stream =
ctx.template device_context<platform::CUDADeviceContext>().stream();
auto input_dim = input->dims();
auto index_dim = index->dims();
size_t batch_size = input_dim[0];
size_t input_length = input_dim[1];
size_t index_length = index_dim[1];
auto block_width = platform::RoundToPowerOfTwo(index_length);
block_width = MIN(block_width, PREDEFINED_BLOCK_SIZE_X);
int block_height =
platform::RoundToPowerOfTwo(index_length * batch_size) / block_width;
block_height = MIN(block_height, PREDEFINED_BLOCK_SIZE / block_width);
dim3 block_dim(block_width, block_height);
dim3 grid_dim((index_length + block_dim.x - 1) / block_dim.x,
(batch_size + block_dim.y - 1) / block_dim.y);
LimitGridDim(ctx, &grid_dim);
if (index_type == framework::proto::VarType::INT64) {
const int64_t* index_data = index->data<int64_t>();
IndexSampleForward<T, int64_t><<<grid_dim, block_dim, 0, stream>>>(
index_data, in_data, out_data, index_length, input_length,
batch_size);
} else if (index_type == framework::proto::VarType::INT32) {
const int* index_data = index->data<int>();
IndexSampleForward<T, int><<<grid_dim, block_dim, 0, stream>>>(
index_data, in_data, out_data, index_length, input_length,
batch_size);
}
}
};
template <typename T>
class IndexSampleGradKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* output_grad = ctx.Input<LoDTensor>(framework::GradVarName("Out"));
auto* input_grad = ctx.Output<LoDTensor>(framework::GradVarName("X"));
auto* index = ctx.Input<LoDTensor>("Index");
const auto* output_grad_data = output_grad->data<T>();
auto* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
const auto& index_type = framework::TransToProtoVarType(index->dtype());
bool index_type_match = index_type == framework::proto::VarType::INT64 ||
index_type == framework::proto::VarType::INT32;
PADDLE_ENFORCE_EQ(index_type_match, true,
platform::errors::InvalidArgument(
"Input(Index) holds the wrong type, it holds %s, but "
"desires to be %s or %s",
paddle::framework::DataTypeToString(index_type),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT32),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT64)));
auto stream =
ctx.template device_context<platform::CUDADeviceContext>().stream();
auto input_num = input_grad->numel();
auto input_dim = input_grad->dims();
auto index_dim = index->dims();
size_t batch_size = index_dim[0];
size_t input_length = input_dim[1];
size_t index_length = index_dim[1];
bool same_data_in_index_row = index_length == 1 ? false : true;
auto block_width = platform::RoundToPowerOfTwo(index_length);
block_width = MIN(block_width, PREDEFINED_BLOCK_SIZE_X);
auto block_height =
platform::RoundToPowerOfTwo(index_length * batch_size) / block_width;
block_height = MIN(block_height, PREDEFINED_BLOCK_SIZE / block_width);
dim3 block_dim(block_width, block_height);
dim3 grid_dim((index_length + block_dim.x - 1) / block_dim.x,
(batch_size + block_dim.y - 1) / block_dim.y);
LimitGridDim(ctx, &grid_dim);
phi::funcs::SetConstant<platform::CUDADeviceContext, T> set_zero;
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
set_zero(dev_ctx, input_grad, static_cast<T>(0));
if (index_type == framework::proto::VarType::INT64) {
const int64_t* index_data = index->data<int64_t>();
IndexSampleGrad<T, int64_t><<<grid_dim, block_dim, 0, stream>>>(
index_data, input_grad_data, output_grad_data, index_length,
input_length, batch_size, same_data_in_index_row);
} else if (index_type == framework::proto::VarType::INT32) {
const int* index_data = index->data<int>();
IndexSampleGrad<T, int><<<grid_dim, block_dim, 0, stream>>>(
index_data, input_grad_data, output_grad_data, index_length,
input_length, batch_size, same_data_in_index_row);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
index_sample,
ops::IndexSampleKernel<paddle::platform::CUDADeviceContext, float>,
ops::IndexSampleKernel<paddle::platform::CUDADeviceContext, double>,
ops::IndexSampleKernel<paddle::platform::CUDADeviceContext, int>,
ops::IndexSampleKernel<paddle::platform::CUDADeviceContext, int64_t>);
REGISTER_OP_CUDA_KERNEL(
index_sample_grad,
ops::IndexSampleGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::IndexSampleGradKernel<paddle::platform::CUDADeviceContext, double>,
ops::IndexSampleGradKernel<paddle::platform::CUDADeviceContext, int>,
ops::IndexSampleGradKernel<paddle::platform::CUDADeviceContext, int64_t>);
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cmath>
#include <fstream>
#include <set>
#include <string>
#include <utility>
#include <vector>
#include "gflags/gflags.h"
#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
using DDim = framework::DDim;
template <typename T, typename IndexT = int>
void IndexSampleInner(const framework::ExecutionContext &context,
const LoDTensor &input, const LoDTensor &index,
LoDTensor *output) {
auto input_dims = input.dims();
auto index_dims = index.dims();
int batch_size = input_dims[0];
auto value_length = input_dims[1];
auto index_length = index_dims[1];
int index_ids_num = index.numel();
std::vector<T> input_vec;
std::vector<IndexT> index_vec;
paddle::framework::TensorToVector(input, context.device_context(),
&input_vec);
paddle::framework::TensorToVector(index, context.device_context(),
&index_vec);
std::vector<T> res(index_ids_num);
for (int i = 0; i < index_ids_num; i++) {
int b = floor(i / index_length);
PADDLE_ENFORCE_GE(
index_vec[i], 0,
platform::errors::InvalidArgument(
"Variable value (index) of OP(index_sample) "
"expected >= 0 and < %ld, but got %ld. Please check input "
"value.",
value_length, index_vec[i]));
PADDLE_ENFORCE_LT(
index_vec[i], value_length,
platform::errors::InvalidArgument(
"Variable value (index) of OP(index_sample) "
"expected >= 0 and < %ld, but got %ld. Please check input "
"value.",
value_length, index_vec[i]));
int v_i = b * value_length + static_cast<int>(index_vec[i]);
T v = input_vec[v_i];
VLOG(4) << "Index Sample: batch = " << b << " index = " << v_i
<< " value = " << v;
res[i] = v;
}
auto ddim = phi::make_ddim({batch_size, index_length});
output->mutable_data<T>(context.GetPlace());
framework::TensorFromVector(res, context.device_context(), output);
output->Resize(ddim);
}
template <typename DeviceContext, typename T>
class IndexSampleKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto *input_var = ctx.InputVar("X");
auto *index_var = ctx.InputVar("Index");
auto &input_tensor = input_var->Get<LoDTensor>();
auto &index_tensor = index_var->Get<LoDTensor>();
auto *out_var = ctx.OutputVar("Out");
auto *out_tensor = out_var->GetMutable<framework::LoDTensor>();
const auto &index_type =
framework::TransToProtoVarType(index_tensor.dtype());
bool index_type_match = index_type == framework::proto::VarType::INT32 ||
index_type == framework::proto::VarType::INT64;
PADDLE_ENFORCE_EQ(index_type_match, true,
platform::errors::InvalidArgument(
"Input(Index) holds the wrong type, it holds %s, but "
"desires to be %s or %s",
paddle::framework::DataTypeToString(index_type),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT32),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT64)));
if (index_type == framework::proto::VarType::INT32) {
IndexSampleInner<T, int>(ctx, input_tensor, index_tensor, out_tensor);
} else if (index_type == framework::proto::VarType::INT64) {
IndexSampleInner<T, int64_t>(ctx, input_tensor, index_tensor, out_tensor);
}
}
};
template <typename T, typename IndexT = int>
void IndexSampleGradInner(const framework::ExecutionContext &context,
const LoDTensor &out_grad, const LoDTensor &index,
LoDTensor *x_grad) {
std::vector<T> out_grad_vec;
std::vector<IndexT> index_vec;
paddle::framework::TensorToVector(out_grad, context.device_context(),
&out_grad_vec);
paddle::framework::TensorToVector(index, context.device_context(),
&index_vec);
auto index_dims = index.dims();
auto x_grad_dims = x_grad->dims();
auto value_length = x_grad_dims[1];
auto index_length = index_dims[1];
int index_ids_num = index.numel();
std::vector<T> x_grad_vec(x_grad->numel(), 0);
for (int i = 0; i < index_ids_num; i++) {
int b = floor(i / index_length);
PADDLE_ENFORCE_GE(
index_vec[i], 0,
platform::errors::InvalidArgument(
"Variable value (index) of OP(index_sample_grad) "
"expected >= 0 and < %ld, but got %ld. Please check input "
"value.",
value_length, index_vec[i]));
PADDLE_ENFORCE_LT(
index_vec[i], value_length,
platform::errors::InvalidArgument(
"Variable value (index) of OP(index_sample_grad) "
"expected >= 0 and < %ld, but got %ld. Please check input "
"value.",
value_length, index_vec[i]));
int v_i = b * value_length + static_cast<int>(index_vec[i]);
x_grad_vec[v_i] += out_grad_vec[i];
}
x_grad->mutable_data<T>(context.GetPlace());
framework::TensorFromVector(x_grad_vec, context.device_context(), x_grad);
x_grad->Resize(x_grad_dims);
}
template <typename DeviceContext, typename T>
class IndexSampleGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
auto *index_var = context.InputVar("Index");
auto *x_grad_var = context.OutputVar(framework::GradVarName("X"));
auto *out_grad_var = context.InputVar(framework::GradVarName("Out"));
auto &index_tensor = index_var->Get<LoDTensor>();
auto &out_grad_tensor = out_grad_var->Get<LoDTensor>();
auto *x_grad_tensor = x_grad_var->GetMutable<framework::LoDTensor>();
const auto &index_type =
framework::TransToProtoVarType(index_tensor.dtype());
bool index_type_match = index_type == framework::proto::VarType::INT32 ||
index_type == framework::proto::VarType::INT64;
PADDLE_ENFORCE_EQ(index_type_match, true,
platform::errors::InvalidArgument(
"Input(Index) holds the wrong type, it holds %s, but "
"desires to be %s or %s",
paddle::framework::DataTypeToString(index_type),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT32),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT64)));
if (index_type == framework::proto::VarType::INT32) {
IndexSampleGradInner<T, int>(context, out_grad_tensor, index_tensor,
x_grad_tensor);
} else if (index_type == framework::proto::VarType::INT64) {
IndexSampleGradInner<T, int64_t>(context, out_grad_tensor, index_tensor,
x_grad_tensor);
}
}
};
} // namespace operators
} // namespace paddle
......@@ -12,8 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/index_sample_op.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
......
......@@ -474,11 +474,11 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_bwd_1024_kernel(
for (int it = 0; it < LDGS; it++) {
#pragma unroll
for (int jt = 0; jt < VecSize; jt++) {
U x_tmp = x[it][jt];
U x_tmp = static_cast<U>(x[it][jt]);
U y_tmp = var_cur_row * (x_tmp - mean_cur_row);
U dy_tmp = static_cast<U>(gamma[it][jt]) *
static_cast<U>(dout[it][jt]); // scale * dy
U dout_tmp = dout[it][jt]; // dy
U dout_tmp = static_cast<U>(dout[it][jt]); // dy
// used for get dx (row reduction)
sum_loss1 += dy_tmp; // scale * dy, sum_1
......
......@@ -259,6 +259,21 @@ REGISTER_OP_CUDA_KERNEL(
ops::LayerNormGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::LayerNormGradKernel<paddle::platform::CUDADeviceContext,
plat::float16>);
#elif CUDNN_VERSION_MIN(8, 1, 0)
REGISTER_OP_CUDA_KERNEL(
layer_norm,
ops::LayerNormKernel<paddle::platform::CUDADeviceContext, float>,
ops::LayerNormKernel<paddle::platform::CUDADeviceContext, double>,
ops::LayerNormKernel<paddle::platform::CUDADeviceContext, plat::float16>,
ops::LayerNormKernel<paddle::platform::CUDADeviceContext, plat::bfloat16>);
REGISTER_OP_CUDA_KERNEL(
layer_norm_grad,
ops::LayerNormGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::LayerNormGradKernel<paddle::platform::CUDADeviceContext, double>,
ops::LayerNormGradKernel<paddle::platform::CUDADeviceContext,
plat::float16>,
ops::LayerNormGradKernel<paddle::platform::CUDADeviceContext,
plat::bfloat16>);
#else
REGISTER_OP_CUDA_KERNEL(
layer_norm,
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/operators/mkldnn/axpy_handler.h"
......@@ -502,32 +503,29 @@ struct MergeAdd<platform::XPUDeviceContext, T> {
out.mutable_value()->mutable_data<T>(
phi::make_ddim({static_cast<int64_t>(merge_rows.size()), input_width}),
context.GetPlace());
int r =
xpu::constant<T>(context.x_context(), out.mutable_value()->data<T>(),
merge_rows.size() * input_width, static_cast<T>(0.f));
PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
platform::errors::External("XPU constant op return"
" wrong value[%d %s].",
r, XPUAPIErrorMsg[r]));
std::unordered_map<int64_t, size_t> rows_to_id;
for (size_t i = 0; i < merge_rows.size(); ++i) {
rows_to_id[merge_rows[i]] = i;
}
auto* out_data = out.mutable_value()->data<T>();
auto* input_data = input.value().data<T>();
auto* y_data = out.mutable_value()->data<T>();
auto* x_data = input.value().data<T>();
int xm = input_rows.size();
int ym = merge_rows.size();
int n = input_width;
for (size_t i = 0; i < input_rows.size(); i++) {
size_t out_i = rows_to_id[input_rows[i]];
auto r = xpu::add(context.x_context(), &input_data[i * input_width],
&out_data[out_i * input_width],
&out_data[out_i * input_width], n);
PADDLE_ENFORCE_EQ(
r, XPU_SUCCESS,
platform::errors::External("XPU API return wrong value[%d %s], ", r,
XPUAPIErrorMsg[r]));
}
xpu::ctx_guard RAII_GUARD(context.x_context());
int64_t* x_rows_data = RAII_GUARD.alloc_l3_or_gm<int64_t>(xm);
int64_t* y_rows_data = RAII_GUARD.alloc_l3_or_gm<int64_t>(ym);
memory::Copy(context.GetPlace(), y_rows_data, platform::CPUPlace(),
merge_rows.data(), ym * sizeof(int64_t));
memory::Copy(context.GetPlace(), x_rows_data, platform::CPUPlace(),
input_rows.data(), xm * sizeof(int64_t));
int r =
xpu::merge_dup_rows<T, int64_t>(context.x_context(), x_data, y_data,
x_rows_data, y_rows_data, xm, n, ym);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "merge_dup_rows");
}
void operator()(const platform::XPUDeviceContext& context,
......@@ -582,15 +580,7 @@ struct MergeAdd<platform::XPUDeviceContext, T> {
{static_cast<int64_t>(merged_row_set.size()), input_width}),
context.GetPlace());
int r =
xpu::constant<T>(context.x_context(), out.mutable_value()->data<T>(),
merge_rows.size() * input_width, static_cast<T>(0.f));
PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
platform::errors::External("XPU constant op return"
" wrong value[%d %s].",
r, XPUAPIErrorMsg[r]));
float* out_data = reinterpret_cast<float*>(out.mutable_value()->data<T>());
float* y_data = reinterpret_cast<float*>(out.mutable_value()->data<T>());
std::unordered_map<int64_t, size_t> rows_to_id;
for (size_t i = 0; i < merge_rows.size(); ++i) {
......@@ -603,17 +593,22 @@ struct MergeAdd<platform::XPUDeviceContext, T> {
}
auto& input_rows = input->rows();
auto* x_data = input->value().data<T>();
int xm = input_rows.size();
int ym = merge_rows.size();
int n = input_width;
for (size_t i = 0; i < input_rows.size(); i++) {
size_t out_i = rows_to_id[input_rows[i]];
auto r = xpu::add(
context.x_context(), input->value().data<T>() + i * input_width,
&out_data[out_i * input_width], &out_data[out_i * input_width], n);
PADDLE_ENFORCE_EQ(
r, XPU_SUCCESS,
platform::errors::External("XPU API return wrong value[%d %s], ", r,
XPUAPIErrorMsg[r]));
}
xpu::ctx_guard RAII_GUARD(context.x_context());
int64_t* x_rows_data = RAII_GUARD.alloc_l3_or_gm<int64_t>(xm);
int64_t* y_rows_data = RAII_GUARD.alloc_l3_or_gm<int64_t>(ym);
memory::Copy(context.GetPlace(), y_rows_data, platform::CPUPlace(),
merge_rows.data(), ym * sizeof(int64_t));
memory::Copy(context.GetPlace(), x_rows_data, platform::CPUPlace(),
input_rows.data(), xm * sizeof(int64_t));
int r =
xpu::merge_dup_rows<T, int64_t>(context.x_context(), x_data, y_data,
x_rows_data, y_rows_data, xm, n, ym);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "merge_dup_rows");
}
}
};
......
......@@ -16,6 +16,7 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/platform/bfloat16.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/phi/kernels/funcs/math_function.h"
......@@ -445,6 +446,7 @@ template struct MergeAdd<platform::CUDADeviceContext, double>;
template struct MergeAdd<platform::CUDADeviceContext, int>;
template struct MergeAdd<platform::CUDADeviceContext, int64_t>;
template struct MergeAdd<platform::CUDADeviceContext, platform::float16>;
template struct MergeAdd<platform::CUDADeviceContext, platform::bfloat16>;
template struct MergeAdd<platform::CUDADeviceContext, platform::complex<float>>;
template struct MergeAdd<platform::CUDADeviceContext,
platform::complex<double>>;
......
......@@ -61,30 +61,31 @@ class DistributedFusedLambInitOpMaker
"The fp32 beta1 power accumulator tensor. Its shape is [1].");
AddOutput("Beta2Pow",
"The fp32 beta2 power accumulator tensor. Its shape is [1].");
AddOutput("FusedIndices",
"The param index of each element in FP32FusedParam. Its shape is "
"[M1+M2]. It is like [0,0,0,1,1,1,1,2,2,...].");
AddOutput(
"FusedParamOffsets",
"The numel offset of each parameter inside the FP32FusedParam. Its "
"shape is [param_num + 1]. It is like [0, n_0, n_0 + n_1, n_0 + n_1 "
"+ n_2, ...].");
AddOutput("FP32ShardFusedParamOffsets",
"The sharded numel offset of each parameter in the local rank. "
"Its shape is [fp32_local_param_num + 1].");
AddOutput("FP16ShardFusedParamOffsets",
"+ n_2, ...]. It should be in CPUPlace.");
AddOutput(
"FP32ShardFusedParamOffsets",
"The sharded numel offset of each parameter in the local rank. "
"Its shape is [fp16_local_param_num + 1].");
"Its shape is [fp32_local_param_num + 1]. It should be in CPUPlace.");
AddOutput(
"WeightDecay",
"The sharded fp32 weight decay tensor. Its shape is [(M1+M2)/N].");
"FP16ShardFusedParamOffsets",
"The sharded numel offset of each parameter in the local rank. "
"Its shape is [fp16_local_param_num + 1]. It should be in CPUPlace.");
AddOutput("ParamInfo",
"The param info. It should be in CPUPlace, and its shape is [6]"
"CPUPlace, and its shape is [6]. It is "
"CPUPlace, and its shape is [8]. It is "
"[fp32_shard_param_start_idx, fp32_local_param_num, "
"fp32_global_param_num, fp16_shard_param_start_idx, "
"fp16_local_param_num, fp16_global_param_num].");
"fp32_global_param_num, fp32_weight_decay_end_idx, "
"fp16_shard_param_start_idx, "
"fp16_local_param_num, fp16_global_param_num, "
"fp16_weight_decay_end_idx].");
AddOutput("ParamOrder",
"The reordered parameter order. Inside this op, "
"the parameter would be reordered by data type and weight decay "
"value.");
AddOutput("ParamOut", "The output parameter list.").AsDuplicable();
AddOutput("MasterParamOut",
"The output master parameter list. It would share the memory of "
......@@ -96,10 +97,8 @@ class DistributedFusedLambInitOpMaker
AddAttr<float>("beta1", "The initial value of Beta1Pow.");
AddAttr<float>("beta2", "The initial value of Beta2Pow.");
AddAttr<std::vector<float>>(
"weight_decay",
"The weight decay for each parameter. Its "
"shape is equal to the global parameter number.");
AddAttr<std::vector<int>>("apply_weight_decay",
"Whether to apply weight decay.");
AddAttr<int>("alignment", "The alignment in bytes for the fused tensors.");
AddAttr<int>("rank", "The global rank of the current process.");
AddAttr<int>("nranks", "The global world size.");
......
......@@ -258,32 +258,6 @@ static void ShareBufferForNonInitedTensor(framework::Tensor *origin,
<< ") , dtype = " << fused_out->dtype();
}
template <typename OffsetT, typename IndexT>
static __global__ void LambFillFusedIndicesCUDAKernel(const OffsetT *offsets,
IndexT *out,
int offset_num,
int out_num) {
CUDA_KERNEL_LOOP_TYPE(i, out_num, int) {
auto idx = phi::funcs::LowerBound(offsets, offset_num, i);
if (idx == offset_num || offsets[idx] != i) {
--idx;
}
out[i] = idx;
}
}
template <typename T>
static void CopyVectorToTensor(const std::vector<T> &src,
framework::Tensor *dst,
const platform::Place &place,
gpuStream_t stream) {
dst->Resize({static_cast<int64_t>(src.size())});
T *dst_ptr = dst->mutable_data<T>(place);
const T *src_ptr = src.data();
auto nbytes = src.size() * sizeof(T);
memory::Copy(place, dst_ptr, platform::CPUPlace(), src_ptr, nbytes, stream);
}
template <typename T>
static void CopyVectorToCPUTensor(const std::vector<T> &src,
framework::Tensor *dst) {
......@@ -294,6 +268,42 @@ static void CopyVectorToCPUTensor(const std::vector<T> &src,
std::memcpy(dst_ptr, src_ptr, nbytes);
}
static size_t ReorderParamGradInfoList(const std::vector<int> &flags,
std::vector<ParamGradInfo> *infos) {
size_t n = infos->size();
std::vector<int> cur_flags;
cur_flags.reserve(n);
for (size_t i = 0; i < n; ++i) {
auto idx = (*infos)[i].idx;
cur_flags.push_back(flags[idx]);
}
auto origin_infos = *infos;
size_t j = 0;
for (size_t i = 0; i < n; ++i) {
if (cur_flags[i]) {
(*infos)[j] = origin_infos[i];
++j;
}
}
size_t ret_idx = j;
for (size_t i = 0; i < n; ++i) {
if (!cur_flags[i]) {
(*infos)[j] = origin_infos[i];
++j;
}
}
return ret_idx;
}
template <typename T>
static T ClipByBound(T x, T low_value, T high_value) {
if (x < low_value) return low_value;
if (x > high_value) return high_value;
return x;
}
template <typename T>
class DistributedFusedLambInitOpKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
......@@ -404,6 +414,24 @@ class DistributedFusedLambInitOpKernel<platform::CUDADeviceContext, T>
info->numel_offset = 0; // not determined yet
}
}
const auto &apply_weight_decay =
ctx.Attr<std::vector<int>>("apply_weight_decay");
size_t fp32_wd_end_idx =
ReorderParamGradInfoList(apply_weight_decay, &fp32_infos);
size_t fp16_wd_end_idx =
ReorderParamGradInfoList(apply_weight_decay, &fp16_infos);
auto *param_order_t = ctx.Output<framework::Tensor>("ParamOrder");
auto param_num = fp32_infos.size() + fp16_infos.size();
param_order_t->Resize({static_cast<int16_t>(param_num)});
auto *param_order = param_order_t->mutable_data<int>(platform::CPUPlace());
for (size_t i = 0; i < fp32_infos.size(); ++i) {
param_order[i] = static_cast<int>(fp32_infos[i].idx);
}
for (size_t i = 0; i < fp16_infos.size(); ++i) {
param_order[i + fp32_infos.size()] = static_cast<int>(fp16_infos[i].idx);
}
VLOG(10) << "Fill ParamGradInfo ends";
// Step 2: determine the numel_with_padding and numel_offset
......@@ -568,45 +596,29 @@ class DistributedFusedLambInitOpKernel<platform::CUDADeviceContext, T>
VLOG(10) << "Found the sharding arguments";
auto *param_info_t = ctx.Output<framework::Tensor>("ParamInfo");
param_info_t->Resize({6});
param_info_t->Resize({8});
auto *param_info = param_info_t->mutable_data<int>(platform::CPUPlace());
param_info[0] = static_cast<int>(fp32_start_idx);
param_info[1] = static_cast<int>(fp32_local_param_num);
param_info[2] = static_cast<int>(fp32_infos.size());
param_info[3] = static_cast<int>(fp16_start_idx + fp32_infos.size());
param_info[4] = static_cast<int>(fp16_local_param_num);
param_info[5] = static_cast<int>(fp16_infos.size());
param_info[3] = ClipByBound<int>(fp32_wd_end_idx, fp32_start_idx,
fp32_start_idx + fp32_local_param_num) -
static_cast<int>(fp32_start_idx);
param_info[4] = static_cast<int>(fp16_start_idx + fp32_infos.size());
param_info[5] = static_cast<int>(fp16_local_param_num);
param_info[6] = static_cast<int>(fp16_infos.size());
param_info[7] = ClipByBound<int>(fp16_wd_end_idx, fp16_start_idx,
fp16_start_idx + fp16_local_param_num) -
static_cast<int>(fp16_start_idx);
VLOG(10) << "Start FP32 idx: " << param_info[0];
VLOG(10) << "Local FP32 param num: " << param_info[1];
VLOG(10) << "Global FP32 param num: " << param_info[2];
VLOG(10) << "Start FP16 idx: " << param_info[3];
VLOG(10) << "Local FP16 param num: " << param_info[4];
VLOG(10) << "Global FP16 param num: " << param_info[5];
VLOG(10) << "Start FP16 idx: " << param_info[4];
VLOG(10) << "Local FP16 param num: " << param_info[5];
VLOG(10) << "Global FP16 param num: " << param_info[6];
// For WeightDecay, shard and perform H2D copy
const auto &origin_weight_decay =
ctx.Attr<std::vector<float>>("weight_decay");
PADDLE_ENFORCE_EQ(params.size(), origin_weight_decay.size(),
platform::errors::InvalidArgument(
"The attr(weight_decay) should have the "
"same length with Input(Param)."));
std::vector<float> shard_weight_decay;
shard_weight_decay.reserve(total_local_param_num);
for (size_t i = 0; i < fp32_local_param_num; ++i) {
shard_weight_decay.push_back(
origin_weight_decay[fp32_infos[i + fp32_start_idx].idx]);
}
for (size_t i = 0; i < fp16_local_param_num; ++i) {
shard_weight_decay.push_back(
origin_weight_decay[fp16_infos[i + fp16_start_idx].idx]);
}
// For FusedIndices, launch CUDA kernel to do binary search
auto *fused_indices_t = ctx.Output<framework::Tensor>("FusedIndices");
fused_indices_t->Resize({static_cast<int64_t>(total_numel)});
auto *fused_indices = fused_indices_t->mutable_data<int>(place);
std::vector<int> numel_offsets;
numel_offsets.reserve(params.size() + 1);
for (const auto &info : fp32_infos) {
......@@ -621,21 +633,6 @@ class DistributedFusedLambInitOpKernel<platform::CUDADeviceContext, T>
"The numel_offsets number must be one larger than "
"the parameter number."));
VLOG(10) << "Total numel offset: " << FlattenToString(numel_offsets);
auto *fused_param_offset_t =
ctx.Output<framework::Tensor>("FusedParamOffsets");
fused_param_offset_t->Resize({static_cast<int64_t>(numel_offsets.size())});
auto *fused_param_offset = fused_param_offset_t->mutable_data<int>(place);
memory::Copy(place, fused_param_offset, platform::CPUPlace(),
numel_offsets.data(),
numel_offsets.size() * sizeof(numel_offsets[0]), stream);
auto config = platform::GetGpuLaunchConfig1D(dev_ctx, total_numel);
LambFillFusedIndicesCUDAKernel<<<config.block_per_grid,
config.thread_per_block, 0, stream>>>(
fused_param_offset, fused_indices, numel_offsets.size() - 1,
total_numel);
std::vector<int> lengths;
lengths.reserve(fp32_local_param_num + fp16_local_param_num);
std::vector<int> fp32_partial_numel_offsets;
fp32_partial_numel_offsets.reserve(fp32_local_param_num + 1);
......@@ -659,9 +656,9 @@ class DistributedFusedLambInitOpKernel<platform::CUDADeviceContext, T>
VLOG(10) << "FP32 Partial numel = ["
<< valid_start_n + fp32_infos[i].numel << ","
<< end_n + fp32_infos[i].numel;
lengths.push_back(end_n - valid_start_n);
auto len = end_n - valid_start_n;
fp32_partial_numel_offsets.push_back(fp32_partial_numel_offsets.back() +
lengths.back());
len);
}
std::vector<int> fp16_partial_numel_offsets;
......@@ -682,9 +679,9 @@ class DistributedFusedLambInitOpKernel<platform::CUDADeviceContext, T>
PADDLE_ENFORCE_NE(valid_start_n, end_n,
platform::errors::InvalidArgument(
"Indices sharding error. This may be a bug."));
lengths.push_back(end_n - valid_start_n);
auto len = end_n - valid_start_n;
fp16_partial_numel_offsets.push_back(fp16_partial_numel_offsets.back() +
lengths.back());
len);
}
CopyVectorToCPUTensor(numel_offsets,
......@@ -696,23 +693,6 @@ class DistributedFusedLambInitOpKernel<platform::CUDADeviceContext, T>
fp16_partial_numel_offsets,
ctx.Output<framework::Tensor>("FP16ShardFusedParamOffsets"));
// Fill the weight decay tensor
PADDLE_ENFORCE_EQ(lengths.size(), shard_weight_decay.size(),
platform::errors::InvalidArgument(
"Invalid weight decay sharding. This may be a bug."));
std::vector<float> wd_cpu;
for (size_t i = 0; i < shard_weight_decay.size(); ++i) {
int len = lengths[i];
for (int j = 0; j < len; ++j) {
wd_cpu.push_back(shard_weight_decay[i]);
}
}
PADDLE_ENFORCE_EQ(wd_cpu.size() * nranks, fp32_numel + fp16_numel,
platform::errors::InvalidArgument(
"Invalid weight decay sharding. This may be a bug."));
CopyVectorToTensor(wd_cpu, ctx.Output<framework::Tensor>("WeightDecay"),
place, stream);
auto *global_scale = ctx.Output<framework::Tensor>("GlobalScale");
if (!global_scale->IsInitialized()) {
TensorFillConstant<float>(dev_ctx, global_scale, {1}, 1.0f);
......
......@@ -66,28 +66,31 @@ class DistributedFusedLambOpMaker : public framework::OpProtoAndCheckerMaker {
"The fp32 beta1 power accumulator tensor. Its shape is [1].");
AddInput("Beta2Pow",
"The fp32 beta2 power accumulator tensor. Its shape is [1].");
AddInput("FusedIndices",
"The param index of each element in FP32FusedParam. Its shape is "
"[M1+M2]. It is like [0,0,0,1,1,1,1,2,2,...].");
AddInput(
"FusedParamOffsets",
"The numel offset of each parameter inside the FP32FusedParam. Its "
"shape is [param_num + 1]. It is like [0, n_0, n_0 + n_1, n_0 + n_1 "
"+ n_2, ...].");
AddInput("FP32ShardFusedParamOffsets",
"+ n_2, ...]. It should be in CPUPlace.");
AddInput(
"FP32ShardFusedParamOffsets",
"The sharded numel offset of each parameter in the local rank. "
"Its shape is [fp32_local_param_num + 1].");
AddInput("FP16ShardFusedParamOffsets",
"Its shape is [fp32_local_param_num + 1]. It should be in CPUPlace.");
AddInput(
"FP16ShardFusedParamOffsets",
"The sharded numel offset of each parameter in the local rank. "
"Its shape is [fp16_local_param_num + 1].");
AddInput("WeightDecay",
"The sharded fp32 weight decay tensor. Its shape is [(M1+M2)/N].");
"Its shape is [fp16_local_param_num + 1]. It should be in CPUPlace.");
AddInput("ParamInfo",
"The param info. It should be in CPUPlace, and its shape is [6]"
"CPUPlace, and its shape is [6]. It is "
"CPUPlace, and its shape is [8]. It is "
"[fp32_shard_param_start_idx, fp32_local_param_num, "
"fp32_global_param_num, fp16_shard_param_start_idx, "
"fp16_local_param_num, fp16_global_param_num].");
"fp32_global_param_num, fp32_weight_decay_end_idx, "
"fp16_shard_param_start_idx, "
"fp16_local_param_num, fp16_global_param_num, "
"fp16_weight_decay_end_idx].");
AddInput("ParamOrder",
"The reordered parameter order. Inside this op, "
"the parameter would be reordered by data type and weight decay "
"value.");
AddInput("LearningRate",
"The fp32 learning rate tensor. Its shape is [1].");
......@@ -116,6 +119,7 @@ class DistributedFusedLambOpMaker : public framework::OpProtoAndCheckerMaker {
"max_global_grad_norm",
"The maximum global gradient l2-norm value for clipping. If "
"max_global_grad_norm <= 0, no clipping would be performed.");
AddAttr<float>("weight_decay", "The weight decay value.");
AddAttr<bool>("clip_after_allreduce",
"Whether to clip before allreduce, only valid when the "
"world size is larger than 1.");
......
......@@ -94,11 +94,40 @@ static __global__ void MultiTensorApplyCUDAKernel(
args...);
}
template <typename Functor, int BlockDim, int MaxTensorNumPerLaunch,
int MaxChunkNumPerLaunch, typename... Args>
static void MultiTensorApply(Functor functor, gpuStream_t stream,
const int *offsets, int n, int chunk_size,
Args... args) {
template <int MaxTensorNumPerLaunch, int MaxChunkNumPerLaunch>
class MultiTensorLauncher {
public:
MultiTensorLauncher(
const TensorMetaList<MaxTensorNumPerLaunch, MaxChunkNumPerLaunch> &meta,
const int &chunk_id, const int &chunk_size, const int &block_dim,
const gpuStream_t &stream)
: meta_(meta),
chunk_id_(chunk_id),
chunk_size_(chunk_size),
block_dim_(block_dim),
stream_(stream) {}
template <typename Functor, typename... Args>
void Launch(Functor &&functor, Args &&... args) const {
MultiTensorApplyCUDAKernel<
Functor, MaxTensorNumPerLaunch,
MaxChunkNumPerLaunch><<<chunk_id_, block_dim_, 0, stream_>>>(
functor, meta_, chunk_size_, args...);
}
private:
const TensorMetaList<MaxTensorNumPerLaunch, MaxChunkNumPerLaunch> &meta_;
const int &chunk_id_;
const int &chunk_size_;
const int &block_dim_;
const gpuStream_t &stream_;
};
template <int MaxTensorNumPerLaunch, int MaxChunkNumPerLaunch,
typename Callback>
static void MultiTensorApplyWithCallback(gpuStream_t stream, const int *offsets,
int n, int chunk_size, int block_dim,
Callback &&callback) {
if (n == 0) return;
constexpr auto NumTensor = MaxTensorNumPerLaunch;
......@@ -110,6 +139,11 @@ static void MultiTensorApply(Functor functor, gpuStream_t stream,
int numel_offset = 0;
metas.start_tensor_id = 0;
metas.start_chunk_id = 0;
int launch_num = 0;
MultiTensorLauncher<MaxTensorNumPerLaunch, MaxChunkNumPerLaunch> launcher(
metas, chunk_id, chunk_size, block_dim, stream);
for (int i = 0; i < n; ++i) {
auto length = offsets[i + 1] - offsets[i];
if (tensor_id == 0) {
......@@ -132,9 +166,8 @@ static void MultiTensorApply(Functor functor, gpuStream_t stream,
bool last_chunk = (i + 1 == n && j + 1 == chunk_num);
if (tensor_full || block_full || last_chunk) {
MultiTensorApplyCUDAKernel<Functor, NumTensor,
NumChunk><<<chunk_id, BlockDim, 0, stream>>>(
functor, metas, chunk_size, args...);
callback(launcher, launch_num);
++launch_num;
chunk_id = 0;
if (j + 1 == chunk_num) { // chunk for the current tensor is full
metas.start_chunk_id = 0;
......@@ -152,5 +185,17 @@ static void MultiTensorApply(Functor functor, gpuStream_t stream,
}
}
template <typename Functor, int MaxTensorNumPerLaunch, int MaxChunkNumPerLaunch,
typename... Args>
static void MultiTensorApply(Functor functor, gpuStream_t stream,
const int *offsets, int n, int chunk_size,
int block_dim, Args &&... args) {
auto callback = [&](const MultiTensorLauncher<MaxTensorNumPerLaunch,
MaxChunkNumPerLaunch> &launcher,
int i) { launcher.Launch(functor, args...); };
MultiTensorApplyWithCallback<MaxTensorNumPerLaunch, MaxChunkNumPerLaunch>(
stream, offsets, n, chunk_size, block_dim, callback);
}
} // namespace operators
} // namespace paddle
......@@ -39,6 +39,11 @@ __device__ __forceinline__ int sgn(T val) {
__device__ __forceinline__ platform::float16 inline_abs(platform::float16 x) {
return static_cast<platform::float16>(abs(static_cast<float>(x)));
}
__device__ __forceinline__ platform::bfloat16 inline_abs(platform::bfloat16 x) {
return static_cast<platform::bfloat16>(abs(static_cast<float>(x)));
}
__device__ __forceinline__ float inline_abs(float x) { return abs(x); }
__device__ __forceinline__ double inline_abs(double x) { return abs(x); }
......@@ -53,6 +58,11 @@ __device__ __forceinline__ platform::float16 inline_pow(
return static_cast<platform::float16>(
pow(static_cast<float>(base), static_cast<float>(exponent)));
}
__device__ __forceinline__ platform::bfloat16 inline_pow(
platform::bfloat16 base, platform::bfloat16 exponent) {
return static_cast<platform::bfloat16>(
pow(static_cast<float>(base), static_cast<float>(exponent)));
}
__device__ __forceinline__ float inline_pow(float base, float exponent) {
return pow(base, exponent);
}
......@@ -202,9 +212,11 @@ using CUDA = paddle::platform::CUDADeviceContext;
REGISTER_OP_CUDA_KERNEL(p_norm,
ops::PnormCUDAKernel<CUDA, paddle::platform::float16>,
ops::PnormCUDAKernel<CUDA, paddle::platform::bfloat16>,
ops::PnormCUDAKernel<CUDA, float>,
ops::PnormCUDAKernel<CUDA, double>);
REGISTER_OP_CUDA_KERNEL(
p_norm_grad, ops::PnormGradCUDAKernel<CUDA, paddle::platform::float16>,
ops::PnormGradCUDAKernel<CUDA, paddle::platform::bfloat16>,
ops::PnormGradCUDAKernel<CUDA, float>,
ops::PnormGradCUDAKernel<CUDA, double>);
......@@ -23,6 +23,7 @@ REGISTER_OP_CUDA_KERNEL(
reduce_sum_grad, CUDAReduceSumGradKernel<bool>,
CUDAReduceSumGradKernel<float>, CUDAReduceSumGradKernel<double>,
CUDAReduceSumGradKernel<paddle::platform::float16>,
CUDAReduceSumGradKernel<paddle::platform::bfloat16>,
CUDAReduceSumGradKernel<int>, CUDAReduceSumGradKernel<int64_t>,
CUDAReduceSumGradKernel<paddle::platform::complex<float>>,
CUDAReduceSumGradKernel<paddle::platform::complex<double>>);
......@@ -15,6 +15,9 @@ limitations under the License. */
#include "paddle/fluid/operators/split_op.h"
#include <string>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/phi/infermeta/unary.h"
namespace paddle {
namespace operators {
using framework::Tensor;
......@@ -23,52 +26,6 @@ class SplitOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
platform::errors::InvalidArgument(
"Input(X) of SplitOp should not be null."));
PADDLE_ENFORCE_GE(ctx->Outputs("Out").size(), 1UL,
platform::errors::InvalidArgument(
"Outputs(Out) of SplitOp should not be empty."));
auto in_dims = ctx->GetInputDim("X");
auto outs_names = ctx->Outputs("Out");
size_t axis = static_cast<size_t>(ctx->Attrs().Get<int>("axis"));
size_t num = static_cast<size_t>(ctx->Attrs().Get<int>("num"));
std::vector<int> sections = static_cast<std::vector<int>>(
ctx->Attrs().Get<std::vector<int>>("sections"));
const size_t outs_number = outs_names.size();
if (sections.size() > 0) {
PADDLE_ENFORCE_EQ(
sections.size(), outs_number,
platform::errors::InvalidArgument("tensor split sections size "
"should be equal to output size."));
}
if (ctx->HasInput("AxisTensor")) {
auto out_dims = phi::make_ddim(std::vector<int>(in_dims.size(), -1));
std::vector<framework::DDim> outs_dims(outs_number, out_dims);
ctx->SetOutputsDim("Out", outs_dims);
for (size_t i = 0; i < outs_number; ++i) {
ctx->ShareLoD("X", "Out", 0, i);
}
return;
}
bool each_section_is_known =
(sections.size() > 0 && !ctx->HasInputs("SectionsTensorList"));
auto outs_dims = UpdateOutsDims(ctx->IsRuntime(), each_section_is_known,
in_dims, num, sections, axis, outs_number);
ctx->SetOutputsDim("Out", outs_dims);
if (axis != 0) {
// Only pass LoD when not spliting along the first dim.
for (size_t i = 0; i < outs_number; ++i) {
ctx->ShareLoD("X", "Out", 0, i);
}
}
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
......@@ -168,6 +125,10 @@ Example:
namespace ops = paddle::operators;
DELCARE_INFER_SHAPE_FUNCTOR(split, SplitInferShapeFunctor,
PT_INFER_META(phi::SplitInferMeta));
REGISTER_OPERATOR(split, ops::SplitOp, ops::SplitOpMaker,
ops::SplitGradMaker<paddle::framework::OpDesc>,
ops::SplitGradMaker<paddle::imperative::OpBase>);
ops::SplitGradMaker<paddle::imperative::OpBase>,
SplitInferShapeFunctor);
......@@ -258,4 +258,5 @@ REGISTER_OP_CUDA_KERNEL(
ops::SumKernel<paddle::platform::CUDADeviceContext, double>,
ops::SumKernel<paddle::platform::CUDADeviceContext, int>,
ops::SumKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::SumKernel<paddle::platform::CUDADeviceContext, plat::float16>);
ops::SumKernel<paddle::platform::CUDADeviceContext, plat::float16>,
ops::SumKernel<paddle::platform::CUDADeviceContext, plat::bfloat16>);
......@@ -281,10 +281,6 @@ REGISTER_OPERATOR(
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
paddle::operators::UniformRandomOpVarTypeInference);
REGISTER_OP_CPU_KERNEL(
uniform_random, paddle::operators::CPUUniformRandomKernel<float>,
paddle::operators::CPUUniformRandomKernel<double>,
paddle::operators::CPUUniformRandomKernel<paddle::platform::bfloat16>);
REGISTER_OP_CPU_KERNEL(
uniform_random_batch_size_like,
paddle::operators::CPUUniformRandomKernel<float>,
......
......@@ -58,9 +58,6 @@ class GPUUniformRandomKernel : public framework::OpKernel<T> {
} // namespace operators
} // namespace paddle
REGISTER_OP_CUDA_KERNEL(uniform_random,
paddle::operators::GPUUniformRandomKernel<float>,
paddle::operators::GPUUniformRandomKernel<double>);
REGISTER_OP_CUDA_KERNEL(uniform_random_batch_size_like,
paddle::operators::GPUUniformRandomKernel<float>,
paddle::operators::GPUUniformRandomKernel<double>);
......@@ -12,8 +12,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/where_op.h"
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/multiary.h"
namespace paddle {
namespace operators {
......@@ -21,31 +23,6 @@ class WhereOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("Condition"), "Input", "Condition", "Where");
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Where");
OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "Where");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Where");
auto cond_dims = ctx->GetInputDim("Condition");
auto x_dims = ctx->GetInputDim("X");
auto y_dims = ctx->GetInputDim("Y");
PADDLE_ENFORCE_EQ(
cond_dims, x_dims,
platform::errors::InvalidArgument(
"The dims of Inputs(Condition) and Inputs(X) should be same. "
"But received Condition's shape is [%s], X's shape is [%s]",
cond_dims, x_dims));
PADDLE_ENFORCE_EQ(x_dims, y_dims,
platform::errors::InvalidArgument(
"The dims of Inputs(X) and Inputs(Y) should be same. "
"But received X's shape is [%s], Y's shape is [%s]",
x_dims, y_dims));
ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
ctx->ShareLoD("X", /*->*/ "Out");
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
......@@ -140,19 +117,12 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(WhereGradNoNeedBufferVarsInferer, "X", "Y");
} // namespace paddle
namespace ops = paddle::operators;
DELCARE_INFER_SHAPE_FUNCTOR(where, WhereInferShapeFunctor,
PT_INFER_META(phi::WhereInferMeta));
REGISTER_OPERATOR(where, ops::WhereOp, ops::WhereOpMaker,
ops::WhereOpGradMaker<paddle::framework::OpDesc>,
ops::WhereOpGradMaker<paddle::imperative::OpBase>);
ops::WhereOpGradMaker<paddle::imperative::OpBase>,
WhereInferShapeFunctor);
REGISTER_OPERATOR(where_grad, ops::WhereGradOp,
ops::WhereGradNoNeedBufferVarsInferer);
REGISTER_OP_CPU_KERNEL(
where, ops::WhereKernel<paddle::platform::CPUDeviceContext, float>,
ops::WhereKernel<paddle::platform::CPUDeviceContext, double>,
ops::WhereKernel<paddle::platform::CPUDeviceContext, int>,
ops::WhereKernel<paddle::platform::CPUDeviceContext, int64_t>);
REGISTER_OP_CPU_KERNEL(
where_grad, ops::WhereGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::WhereGradKernel<paddle::platform::CPUDeviceContext, double>,
ops::WhereGradKernel<paddle::platform::CPUDeviceContext, int>,
ops::WhereGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h"
#include "paddle/fluid/operators/where_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
namespace platform = paddle::platform;
namespace paddle {
namespace operators {
template <typename T>
struct CondFunctor {
HOSTDEVICE inline CondFunctor() {}
HOSTDEVICE inline T operator()(const bool cond, const T x, const T y) const {
return cond ? x : y;
}
};
template <typename T>
__global__ void WhereCUDAKernel(const int N, const bool* cond, const T* x,
const T* y, T* out) {
int idx = blockDim.x * blockIdx.x + threadIdx.x;
for (; idx < N; idx += blockDim.x * gridDim.x) {
out[idx] = cond[idx] ? x[idx] : y[idx];
}
}
template <typename T>
__global__ void WhereGradCUDAKernel(const int N, const T* dout,
const bool* cond, T* dx, T* dy) {
int idx = blockDim.x * blockIdx.x + threadIdx.x;
for (; idx < N; idx += blockDim.x * gridDim.x) {
if (dx != nullptr) {
dx[idx] = cond[idx] ? dout[idx] : 0.;
}
if (dy != nullptr) {
dy[idx] = cond[idx] ? 0. : dout[idx];
}
}
}
template <typename T>
class WhereKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* condition = context.Input<framework::Tensor>("Condition");
auto* X = context.Input<framework::Tensor>("X");
auto* Y = context.Input<framework::Tensor>("Y");
auto* out = context.Output<framework::Tensor>("Out");
auto numel = condition->numel();
// TODO(GaaoWei8): Input of where can be broadcast
const bool* cond_data = condition->data<bool>();
const T* x_data = X->data<T>();
const T* y_data = Y->data<T>();
T* out_data = out->mutable_data<T>(context.GetPlace());
auto stream = context.cuda_device_context().stream();
auto& dev_ctx =
context.template device_context<platform::CUDADeviceContext>();
auto functor = CondFunctor<T>();
std::vector<const framework::Tensor*> ins = {condition, X, Y};
std::vector<framework::Tensor*> outs = {out};
paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
&outs, functor);
}
};
template <typename T>
class WhereGradKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* condition = context.Input<framework::Tensor>("Condition");
const bool* cond_data = condition->data<bool>();
auto numel = condition->numel();
auto* dout_t =
context.Input<framework::Tensor>(framework::GradVarName("Out"));
auto* dx_t = context.Output<framework::Tensor>(framework::GradVarName("X"));
auto* dy_t = context.Output<framework::Tensor>(framework::GradVarName("Y"));
auto* dout = dout_t->data<T>();
T* dx =
(dx_t != nullptr) ? dx_t->mutable_data<T>(context.GetPlace()) : nullptr;
T* dy =
(dy_t != nullptr) ? dy_t->mutable_data<T>(context.GetPlace()) : nullptr;
auto stream = context.cuda_device_context().stream();
auto& dev_ctx =
context.template device_context<platform::CUDADeviceContext>();
auto config = GetGpuLaunchConfig1D(dev_ctx, condition->numel());
WhereGradCUDAKernel<
T><<<config.block_per_grid.x, config.thread_per_block.x, 0, stream>>>(
numel, dout, cond_data, dx, dy);
}
};
} // namespace operators
} // namespace paddle
REGISTER_OP_CUDA_KERNEL(
where, paddle::operators::WhereKernel<platform::CUDADeviceContext, float>,
paddle::operators::WhereKernel<platform::CUDADeviceContext, double>,
paddle::operators::WhereKernel<platform::CUDADeviceContext, int>,
paddle::operators::WhereKernel<platform::CUDADeviceContext, int64_t>);
REGISTER_OP_CUDA_KERNEL(
where_grad,
paddle::operators::WhereGradKernel<platform::CUDADeviceContext, float>,
paddle::operators::WhereGradKernel<platform::CUDADeviceContext, double>,
paddle::operators::WhereGradKernel<platform::CUDADeviceContext, int>,
paddle::operators::WhereGradKernel<platform::CUDADeviceContext, int64_t>);
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class WhereKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* condition = context.Input<framework::Tensor>("Condition");
auto* X = context.Input<framework::Tensor>("X");
auto* Y = context.Input<framework::Tensor>("Y");
auto* out = context.Output<framework::Tensor>("Out");
const bool* cond_data = condition->data<bool>();
const T* x_data = X->data<T>();
const T* y_data = Y->data<T>();
T* out_data = out->mutable_data<T>(context.GetPlace());
auto x_numel = X->numel();
for (int i = 0; i < x_numel; i++) {
out_data[i] = cond_data[i] ? x_data[i] : y_data[i];
}
}
};
template <typename DeviceContext, typename T>
class WhereGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* condition = context.Input<framework::LoDTensor>("Condition");
const auto* cond_data = condition->data<bool>();
auto numel = condition->numel();
auto* dout_t =
context.Input<framework::Tensor>(framework::GradVarName("Out"));
auto* dx_t = context.Output<framework::Tensor>(framework::GradVarName("X"));
auto* dy_t = context.Output<framework::Tensor>(framework::GradVarName("Y"));
auto* dout = dout_t->data<T>();
if (dx_t != nullptr) {
auto* dx = dx_t->mutable_data<T>(context.GetPlace());
for (int i = 0; i < numel; i++) {
dx[i] = dout[i] * (cond_data[i] ? 1. : 0.);
}
}
if (dy_t != nullptr) {
auto* dy = dy_t->mutable_data<T>(context.GetPlace());
for (int i = 0; i < numel; i++) {
dy[i] = dout[i] * (cond_data[i] ? 0. : 1.);
}
}
}
};
} // namespace operators
} // namespace paddle
......@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/where_op.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
......
......@@ -14,7 +14,7 @@
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/operators/where_op.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
......
......@@ -20,6 +20,7 @@ limitations under the License. */
#include <hip/hip_runtime.h>
#endif
#include <stdio.h>
#include "paddle/fluid/platform/bfloat16.h"
#include "paddle/fluid/platform/complex.h"
#include "paddle/fluid/platform/float16.h"
......@@ -244,6 +245,72 @@ __device__ __forceinline__ void VectorizedAtomicAddPerBlock(
#endif
#endif
// NOTE(zhangbo): cuda do not have atomicCAS for __nv_bfloat16.
inline static __device__ uint32_t bf16_add_to_low_half(uint32_t val, float x) {
bfloat16 low_half;
// the bfloat16 in lower 16bits
low_half.x = static_cast<uint16_t>(val & 0xFFFFu);
low_half = static_cast<bfloat16>(static_cast<float>(low_half) + x);
return (val & 0xFFFF0000u) | low_half.x;
}
inline static __device__ uint32_t bf16_add_to_high_half(uint32_t val, float x) {
bfloat16 high_half;
// the bfloat16 in higher 16bits
high_half.x = static_cast<uint16_t>(val >> 16);
high_half = static_cast<bfloat16>(static_cast<float>(high_half) + x);
return (val & 0xFFFFu) | (static_cast<uint32_t>(high_half.x) << 16);
}
#if CUDA_VERSION >= 11000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
static __device__ __forceinline__ bfloat16 CUDABF16ToPDBF16(__nv_bfloat16 x) {
return *reinterpret_cast<bfloat16 *>(&x);
}
static __device__ __forceinline__ __nv_bfloat16 PDBF16ToCUDABF16(bfloat16 x) {
return *reinterpret_cast<__nv_bfloat16 *>(&x);
}
CUDA_ATOMIC_WRAPPER(Add, bfloat16) {
return CUDABF16ToPDBF16(atomicAdd(reinterpret_cast<__nv_bfloat16 *>(address),
PDBF16ToCUDABF16(val)));
}
#else
CUDA_ATOMIC_WRAPPER(Add, bfloat16) {
// concrete packed bfloat16 value may exsits in lower or higher 16bits
// of the 32bits address.
uint32_t *address_as_ui = reinterpret_cast<uint32_t *>(
reinterpret_cast<char *>(address) -
(reinterpret_cast<uintptr_t>(address) & 0x02));
float val_f = static_cast<float>(val);
uint32_t old = *address_as_ui;
uint32_t sum;
uint32_t newval;
uint32_t assumed;
if (((uintptr_t)address & 0x02) == 0) {
// the bfloat16 value stay at lower 16 bits of the address.
do {
assumed = old;
old = atomicCAS(address_as_ui, assumed,
bf16_add_to_low_half(assumed, val_f));
} while (old != assumed);
bfloat16 ret;
ret.x = old & 0xFFFFu;
return ret;
} else {
// the bfloat16 value stay at higher 16 bits of the address.
do {
assumed = old;
old = atomicCAS(address_as_ui, assumed,
bf16_add_to_high_half(assumed, val_f));
} while (old != assumed);
bfloat16 ret;
ret.x = old >> 16;
return ret;
}
}
#endif
CUDA_ATOMIC_WRAPPER(Add, complex<float>) {
float *real = reinterpret_cast<float *>(address);
float *imag = real + 1;
......
......@@ -81,7 +81,7 @@ set(PYBIND_SRCS
cuda_streams_py.cc)
if(NOT ON_INFER)
set (PYBIND_DEPS ${PYBIND_DEPS} processgroup)
set (PYBIND_DEPS ${PYBIND_DEPS} processgroup eager_reducer)
if (WITH_NCCL)
set (PYBIND_DEPS ${PYBIND_DEPS} processgroup_nccl)
endif()
......
......@@ -23,6 +23,7 @@ limitations under the License. */
#include "paddle/fluid/distributed/collective/ProcessGroup.h"
#include "paddle/fluid/distributed/collective/Types.h"
#include "paddle/fluid/distributed/collective/reducer.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/imperative/layer.h"
......@@ -143,6 +144,19 @@ void BindDistributed(py::module *m) {
[](distributed::ProcessGroupStrategy &self, int nrings) {
self.nrings_ = nrings;
});
m->def("eager_assign_group_by_size",
[](py::handle py_tensors, std::vector<bool> is_sparse_gradient,
std::vector<size_t> group_size_limits,
std::vector<int64_t> tensor_indices) {
auto tensors = CastPyArg2VectorOfTensor(py_tensors.ptr(), 0);
return distributed::Eager_AssignGroupBySize(
tensors, is_sparse_gradient, group_size_limits, tensor_indices);
},
py::arg("tensors"), py::arg("is_sparse_gradient"),
py::arg("group_size_limits") = std::vector<size_t>{25 * 1024 * 1024},
py::arg("tensor_indices") = std::vector<int64_t>{},
py::call_guard<py::gil_scoped_release>());
}
} // end namespace pybind
......
......@@ -60,7 +60,8 @@ std::map<std::string, std::set<std::string>> op_ins_map = {
{"momentum", {"Param", "Grad", "Velocity", "LearningRate", "MasterParam"}},
{"merged_momentum",
{"Param", "Grad", "Velocity", "LearningRate", "MasterParam"}},
{"sparse_momentum", {"Param", "Grad", "Velocity", "Index", "LearningRate"}},
{"sparse_momentum",
{"Param", "Grad", "Velocity", "Index", "LearningRate", "MasterParam"}},
{"rnn", {"Input", "PreState", "WeightList", "SequenceLength"}},
{"run_program", {"X", "Params"}},
{"fused_feedforward",
......@@ -124,7 +125,7 @@ std::map<std::string, std::set<std::string>> op_outs_map = {
{"generate_proposals_v2", {"RpnRois", "RpnRoiProbs", "RpnRoisNum"}},
{"momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}},
{"merged_momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}},
{"sparse_momentum", {"ParamOut", "VelocityOut"}},
{"sparse_momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}},
{"rnn", {"DropoutState", "Reserve", "Out", "State"}},
{"run_program", {"DOut"}},
{"adam",
......@@ -181,7 +182,7 @@ std::map<std::string, std::set<std::string>> op_passing_outs_map = {
"out_old_num_accumulates", "out_num_updates"}},
{"momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}},
{"merged_momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}},
{"sparse_momentum", {"ParamOut", "VelocityOut"}},
{"sparse_momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}},
{"batch_norm", {"MeanOut", "VarianceOut"}},
{"sync_batch_norm", {"MeanOut", "VarianceOut"}},
{"accuracy", {"Correct", "Total"}},
......
......@@ -40,6 +40,12 @@ def DenseTensor : Infrt_Type<"DenseTensor"> {
);
}
// Type Constrait for concrete DenseTensor type.
class DenseTensor<string target, string precision, string layout> :
Type<CPred<"$_self == ::infrt::DenseTensorType::get($_self.getContext(), ::infrt::TargetType::"#target#",::infrt::PrecisionType::"#precision#",::infrt::LayoutType::"#layout#")">,
"!infrt.DenseTensor<"#target#","#precision#","#layout#">",
"::infrt::DenseTensorType">;
// Base class for infrt dialect attributes.
class Infrt_Attr<string name, list<Trait> traits = [],
string baseCppClass = "::mlir::Attribute">
......
......@@ -21,8 +21,8 @@
#include "paddle/infrt/dialect/infrt/infrt_dialect.h"
#include "paddle/infrt/dialect/infrt_base.h"
#include "paddle/infrt/dialect/pd_ops.h"
#include "paddle/infrt/dialect/phi/infrt_phi_tensor.h"
#include "paddle/infrt/dialect/phi/phi_base.h"
#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h"
#include "paddle/infrt/dialect/phi/ir/phi_base.h"
#include "paddle/infrt/dialect/tensor_shape.h"
namespace infrt {
......
......@@ -2,16 +2,7 @@ if (NOT INFRT_WITH_PHI)
return()
endif()
#mlir_tablegen_on(infrt_phi_base DIALECT phi)
add_mlir_dialect(infrt_phi_base phi)
add_mlir_dialect(infrt_phi_tensor phi_dt)
add_mlir_dialect(infrt_phi_kernel phi_kernel)
#mlir_tablegen_on(infrt_phi_tensor)
gather_srcs(infrt_src SRCS
phi_base.cc infrt_phi_tensor.cc
infrt_phi_tensor.cc)
add_subdirectory(ir)
add_subdirectory(pass)
add_executable(phi-exec phi_exec.cc)
......
#mlir_tablegen_on(infrt_phi_base DIALECT phi)
add_mlir_dialect(infrt_phi_base phi)
add_mlir_dialect(infrt_phi_tensor phi_dt)
add_mlir_dialect(infrt_phi_kernel phi_kernel)
#mlir_tablegen_on(infrt_phi_tensor)
gather_srcs(infrt_src SRCS
phi_base.cc
infrt_phi_tensor.cc)
......@@ -4,7 +4,7 @@
include "mlir/Interfaces/SideEffectInterfaces.td"
include "mlir/IR/OpBase.td"
include "paddle/infrt/dialect/infrt_base.td"
include "paddle/infrt/dialect/phi/infrt_phi_base.td"
include "paddle/infrt/dialect/phi/ir/infrt_phi_base.td"
def PHI_KernelDialect : Dialect {
let name = "phi_kernel";
......
......@@ -12,12 +12,12 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/infrt/dialect/phi/infrt_phi_tensor.h"
#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h"
#include <mlir/IR/BuiltinTypes.h>
#include "paddle/infrt/dialect/phi/infrt_phi_tensorDialect.cpp.inc"
#include "paddle/infrt/dialect/phi/infrt_phi_tensorTypes.cpp.inc"
#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensorDialect.cpp.inc"
#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensorTypes.cpp.inc"
namespace infrt {
namespace phi {
......@@ -25,7 +25,7 @@ namespace phi {
void PHIDenseTensorDialect::initialize() {
#define GET_OP_LIST
addOperations<
#include "paddle/infrt/dialect/phi/infrt_phi_tensor.cpp.inc"
#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.cpp.inc"
>();
}
......@@ -33,4 +33,4 @@ void PHIDenseTensorDialect::initialize() {
} // namespace infrt
#define GET_OP_CLASSES
#include "paddle/infrt/dialect/phi/infrt_phi_tensor.cpp.inc" // NOLINT
#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.cpp.inc" // NOLINT
......@@ -29,11 +29,11 @@
#include <mlir/Interfaces/LoopLikeInterface.h>
#include <mlir/Interfaces/SideEffectInterfaces.h>
#include "paddle/infrt/dialect/phi/infrt_phi_tensorDialect.h.inc"
#include "paddle/infrt/dialect/phi/infrt_phi_tensorTypes.h.inc"
#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensorDialect.h.inc"
#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensorTypes.h.inc"
#include "paddle/infrt/dialect/dense_tensor.h"
#include "paddle/infrt/dialect/phi/phi_base.h"
#include "paddle/infrt/dialect/phi/ir/phi_base.h"
// NOLINT
#define GET_OP_CLASSES
#include "paddle/infrt/dialect/phi/infrt_phi_tensor.h.inc"
#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h.inc"
......@@ -2,7 +2,7 @@
#else
#define PHI_TENSOR
include "paddle/infrt/dialect/phi/infrt_phi_base.td"
include "paddle/infrt/dialect/phi/ir/infrt_phi_base.td"
include "mlir/Interfaces/SideEffectInterfaces.td"
include "mlir/IR/OpBase.td"
include "paddle/infrt/dialect/infrt_base.td"
......
......@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/infrt/dialect/phi/phi_base.h"
#include "paddle/infrt/dialect/phi/ir/phi_base.h"
#include <mlir/IR/Builders.h>
#include <mlir/IR/Dialect.h>
......@@ -21,8 +21,8 @@
#include <mlir/IR/TypeUtilities.h>
#include <mlir/IR/Types.h>
#include "paddle/infrt/common/global.h"
#include "paddle/infrt/dialect/phi/infrt_phi_base.cpp.inc"
#include "paddle/infrt/dialect/phi/infrt_phi_baseDialect.cpp.inc"
#include "paddle/infrt/dialect/phi/ir/infrt_phi_base.cpp.inc"
#include "paddle/infrt/dialect/phi/ir/infrt_phi_baseDialect.cpp.inc"
namespace infrt {
namespace phi {
......@@ -51,11 +51,11 @@ void PHIDialect::printType(::mlir::Type type,
void PHIDialect::initialize() {
addOperations<
#define GET_OP_LIST
#include "paddle/infrt/dialect/phi/infrt_phi_base.cpp.inc" // NOLINT
#include "paddle/infrt/dialect/phi/ir/infrt_phi_base.cpp.inc" // NOLINT
>();
addTypes<
#define GET_TYPEDEF_LIST
#include "paddle/infrt/dialect/phi/infrt_phi_baseTypes.cpp.inc" // NOLINT
#include "paddle/infrt/dialect/phi/ir/infrt_phi_baseTypes.cpp.inc" // NOLINT
>();
}
......@@ -81,4 +81,4 @@ mlir::Type PHIDialect::parseType(mlir::DialectAsmParser& parser) const {
} // namespace infrt
#define GET_TYPEDEF_CLASSES
#include "paddle/infrt/dialect/phi/infrt_phi_baseTypes.cpp.inc" // NOLINT
#include "paddle/infrt/dialect/phi/ir/infrt_phi_baseTypes.cpp.inc" // NOLINT
......@@ -19,11 +19,13 @@
#include <string>
#include "paddle/infrt/dialect/phi/infrt_phi_base.h.inc"
#include "paddle/infrt/dialect/phi/infrt_phi_baseDialect.h.inc"
#include "paddle/infrt/dialect/phi/ir/infrt_phi_baseDialect.h.inc"
#define GET_TYPEDEF_CLASSES
#include "paddle/infrt/dialect/phi/infrt_phi_baseTypes.h.inc"
#include "paddle/infrt/dialect/phi/ir/infrt_phi_baseTypes.h.inc"
#define GET_OP_CLASSES
#include "paddle/infrt/dialect/phi/ir/infrt_phi_base.h.inc"
namespace mlir {
namespace OpTrait {
......
......@@ -73,7 +73,7 @@ using ValueVariantType =
std::vector<phi::DenseTensor>,
paddle::experimental::ScalarBase<phi::DenseTensor>,
paddle::experimental::ScalarArrayBase<phi::DenseTensor>,
std::vector<phi::MetaTensor>,
std::vector<phi::MetaTensor*>,
phi::MetaConfig,
paddle::experimental::Backend,
paddle::experimental::DataLayout,
......
......@@ -94,12 +94,16 @@ std::vector<Tensor> split_impl(const Tensor& x,
std::vector<Tensor> out;
auto dense_outs = SetKernelOutput(out_number, kernel_backend, &out);
std::vector<phi::MetaTensor> meta_outs;
meta_outs.reserve(out_number);
std::vector<phi::MetaTensor*> meta_out_ptrs;
meta_out_ptrs.reserve(out_number);
for (size_t i = 0; i < out_number; ++i) {
meta_outs.push_back(dense_outs[i]);
meta_out_ptrs.push_back(&meta_outs.back());
}
phi::SplitInferMeta(
MakeMetaTensor(*dense_x), num_or_sections, axis, &meta_outs);
MakeMetaTensor(*dense_x), num_or_sections, axis, meta_out_ptrs);
using kernel_signature = void (*)(const platform::DeviceContext&,
const phi::DenseTensor&,
......
......@@ -136,26 +136,5 @@ phi::ScalarArray MakePhiScalarArrayFromVarList(
return result;
}
void ResetTensorDtypeAndLayoutByArgDef(phi::TensorBase* dst,
const phi::TensorArgDef& arg_def) {
VLOG(5) << "ResetTensor by TensorArgDef.";
if (phi::DenseTensor::classof(dst)) {
auto* dense_t = static_cast<phi::DenseTensor*>(dst);
auto* meta = phi::DenseTensorUtils::GetMutableMeta(dense_t);
meta->dtype = arg_def.dtype;
meta->layout = arg_def.layout;
} else if (phi::SelectedRows::classof(dst)) {
auto* selected_rows = static_cast<phi::SelectedRows*>(dst);
auto* meta =
phi::DenseTensorUtils::GetMutableMeta(selected_rows->mutable_value());
meta->dtype = arg_def.dtype;
meta->layout = arg_def.layout;
} else {
PADDLE_THROW(phi::errors::Unimplemented(
"Unsupported tensor type is received when reseting tensor dtype and "
"layout by argument definition."));
}
}
} // namespace experimental
} // namespace paddle
......@@ -42,8 +42,5 @@ phi::ScalarArray MakePhiScalarArrayFromVar(const framework::Variable& variable);
phi::ScalarArray MakePhiScalarArrayFromVarList(
const std::vector<framework::Variable*>& variable_list);
void ResetTensorDtypeAndLayoutByArgDef(phi::TensorBase* dst,
const phi::TensorArgDef& arg_def);
} // namespace experimental
} // namespace paddle
......@@ -227,4 +227,12 @@ class GPUContext : public DeviceContext {
// must use different function name for cudnn kernel
using GPUDNNContext = GPUContext;
// KPS (Kernel PrimitiveS API) needs to exist as a kind of backend,
// because we want to implement a KPS-based kernel and make it run
// on GPU and XPU at the same time, so we need KPSContext when registering
// KPS Kernel. Note: XPU and GPU cannot be compiled at the same time!
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
using KPSContext = GPUContext;
#endif
} // namespace phi
......@@ -66,4 +66,12 @@ class XPUContext : public DeviceContext {
std::unique_ptr<Impl> impl_;
};
// KPS (Kernel PrimitiveS API) needs to exist as a kind of backend,
// because we want to implement a KPS-based kernel and make it run
// on GPU and XPU at the same time, so we need KPSContext when registering
// KPS Kernel. Note: XPU and GPU cannot be compiled at the same time!
#if PADDLE_WITH_XPU_KP
using KPSContext = XPUContext;
#endif
} // namespace phi
......@@ -52,6 +52,9 @@ enum class Backend : uint8_t {
MKLDNN,
GPUDNN, // cuDNN and hipDNN
// paddle kernel primitives backend
KPS,
// end of backend types
NUM_BACKENDS,
......@@ -115,6 +118,9 @@ inline std::ostream& operator<<(std::ostream& os, Backend backend) {
case Backend::GPUDNN:
os << "GPUDNN";
break;
case Backend::KPS:
os << "KPS";
break;
default: {
size_t device_type_id_ = static_cast<size_t>(backend) -
static_cast<size_t>(Backend::NUM_BACKENDS);
......@@ -147,6 +153,8 @@ inline Backend StringToBackend(const char* backend_cstr) {
return Backend::MKLDNN;
} else if (s == std::string("GPUDNN")) {
return Backend::GPUDNN;
} else if (s == std::string("KPS")) {
return Backend::KPS;
} else {
return static_cast<Backend>(static_cast<size_t>(Backend::NUM_BACKENDS) +
phi::GetOrRegisterGlobalDeviceTypeId(s));
......
......@@ -22,8 +22,8 @@ cc_library(sparse_csr_tensor SRCS sparse_csr_tensor.cc DEPS dense_tensor tensor_
cc_library(meta_tensor SRCS meta_tensor.cc DEPS tensor_base tensor_meta dense_tensor)
cc_library(infermeta_utils SRCS infermeta_utils.cc DEPS meta_tensor)
cc_library(phi_device_context SRCS device_context.cc DEPS dense_tensor selected_rows)
cc_library(selected_rows SRCS selected_rows_impl.cc DEPS dense_tensor phi_enforce ddim memcpy)
cc_library(phi_device_context SRCS device_context.cc DEPS dense_tensor selected_rows)
cc_library(phi_custom_kernel SRCS custom_kernel.cc DEPS kernel_factory convert_utils)
......
......@@ -66,6 +66,14 @@ phi::Place TransToPhiPlace(const Backend& backend, bool set_device_id) {
case phi::Backend::XPU:
return phi::XPUPlace(
set_device_id ? phi::backends::xpu::GetXPUCurrentDeviceId() : 0);
#endif
case phi::Backend::KPS:
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
return phi::GPUPlace(
set_device_id ? phi::backends::gpu::GetCurrentDeviceId() : 0);
#elif defined(PADDLE_WITH_XPU_KP)
return phi::XPUPlace(
set_device_id ? phi::backends::xpu::GetXPUCurrentDeviceId() : 0);
#endif
default: {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
......
......@@ -20,16 +20,16 @@ void RegisterCustomKernels(const CustomKernelMap& custom_kernel_map) {
auto& kernel_info_map = custom_kernel_map.GetMap();
VLOG(3) << "Size of custom_kernel_map: " << kernel_info_map.size();
auto& kernels = KernelFactory::Instance().kernels();
for (auto& pair : kernel_info_map) {
PADDLE_ENFORCE_EQ(
KernelFactory::Instance().HasCompatiblePhiKernel(pair.first),
true,
PADDLE_ENFORCE_NE(
kernels.find(pair.first),
kernels.end(),
phi::errors::InvalidArgument(
"The kernel %s is not ready for custom kernel registering.",
pair.first));
for (auto& info_pair : pair.second) {
auto& kernels = KernelFactory::Instance().kernels();
PADDLE_ENFORCE_EQ(
kernels[pair.first].find(info_pair.first),
kernels[pair.first].end(),
......
......@@ -73,11 +73,6 @@ void DenseTensor::set_layout(const paddle::framework::DataLayout layout) {
// Note: When you reset holder, you need to ensure the offset is correct
void DenseTensor::ResetHolder(const std::shared_ptr<phi::Allocation>& holder) {
if (holder_) {
// TODO(zyfncg): The change of static_cast<> in check will recover back
// when SetAllocationForOutputTenosr is deleted.
// Now the numel() may return -1, and will cast to a very large number when
// compare with a data with unsigned long type, this will make checking
// failed, so it's a temporary solution to deal with this problem.
PADDLE_ENFORCE_LE(
numel() * static_cast<int64_t>(SizeOf(dtype())) +
static_cast<int64_t>(meta_.offset),
......
......@@ -75,13 +75,13 @@ paddle::optional<const phi::MetaTensor&> InferMetaContext::OptionalInputAt(
: paddle::optional<const phi::MetaTensor&>{paddle::none};
}
std::vector<MetaTensor> InferMetaContext::InputsBetween(size_t start,
std::vector<MetaTensor*> InferMetaContext::InputsBetween(size_t start,
size_t end) const {
std::vector<MetaTensor> result;
std::vector<MetaTensor*> result;
result.reserve(end - start);
for (size_t i = start; i < end; ++i) {
result.emplace_back(*inputs_.at(i));
result.push_back(inputs_.at(i).get());
}
return result;
......@@ -91,12 +91,12 @@ MetaTensor* InferMetaContext::MutableOutputAt(size_t idx) {
return outputs_.at(idx).get();
}
std::vector<MetaTensor> InferMetaContext::MutableOutputBetween(size_t start,
std::vector<MetaTensor*> InferMetaContext::MutableOutputBetween(size_t start,
size_t end) {
std::vector<MetaTensor> result;
std::vector<MetaTensor*> result;
result.reserve(end - start);
for (size_t i = start; i < end; ++i) {
result.emplace_back(*outputs_.at(i));
result.emplace_back(outputs_.at(i).get());
}
return result;
}
......
......@@ -50,13 +50,13 @@ class InferMetaContext {
const std::pair<int, int>& OutputRangeAt(size_t idx) const;
const MetaConfig& GetMetaConfig() const;
const MetaTensor& InputAt(size_t idx) const;
const MetaTensor& InputAt(size_t idx) const;
paddle::optional<const phi::MetaTensor&> OptionalInputAt(size_t idx) const;
std::vector<MetaTensor*> InputsBetween(size_t start, size_t end) const;
std::vector<MetaTensor> InputsBetween(size_t start, size_t end) const;
MetaTensor* MutableOutputAt(size_t idx);
std::vector<MetaTensor> MutableOutputBetween(size_t start, size_t end);
std::vector<MetaTensor*> MutableOutputBetween(size_t start, size_t end);
template <typename AttrType>
AttrType AttrAt(size_t idx) {
......@@ -157,7 +157,7 @@ struct InferMetaFnImpl<Return (*)(Args...), infer_meta_fn> {
};
template <typename... Tail>
struct InferMetaFnCallHelper<const std::vector<MetaTensor>&, Tail...> {
struct InferMetaFnCallHelper<const std::vector<MetaTensor*>&, Tail...> {
template <int in_idx, int attr_idx, int out_idx, typename... PreviousArgs>
static void Call(InferMetaContext* ctx, PreviousArgs&... pargs) {
static_assert(attr_idx == 0,
......@@ -165,7 +165,7 @@ struct InferMetaFnImpl<Return (*)(Args...), infer_meta_fn> {
static_assert(out_idx == 0,
"InferMeta's Input should appear before Outputs.");
const std::pair<int, int> range = ctx->InputRangeAt(in_idx);
std::vector<MetaTensor> arg =
std::vector<MetaTensor*> arg =
ctx->InputsBetween(range.first, range.second);
InferMetaFnCallHelper<
Tail...>::template Call<in_idx + 1, attr_idx, out_idx>(ctx,
......@@ -210,13 +210,12 @@ struct InferMetaFnImpl<Return (*)(Args...), infer_meta_fn> {
};
template <typename... Tail>
struct InferMetaFnCallHelper<std::vector<MetaTensor>*, Tail...> {
struct InferMetaFnCallHelper<std::vector<MetaTensor*>, Tail...> {
template <int in_idx, int attr_idx, int out_idx, typename... PreviousArgs>
static void Call(InferMetaContext* ctx, PreviousArgs&... pargs) {
const std::pair<int, int> range = ctx->OutputRangeAt(out_idx);
std::vector<MetaTensor> tmp =
std::vector<MetaTensor*> arg =
ctx->MutableOutputBetween(range.first, range.second);
std::vector<MetaTensor>* arg = &tmp;
InferMetaFnCallHelper<
Tail...>::template Call<in_idx, attr_idx, out_idx + 1>(ctx,
pargs...,
......
......@@ -87,13 +87,11 @@ struct KernelArgsParseFunctor<Return_ (*)(Args_...)> {
default_tensor_layout,
default_key.dtype(),
arg_type);
#ifndef PADDLE_WITH_CUSTOM_KERNEL
} else if (arg_type == std::type_index(typeid(const SelectedRows&))) {
args_def->AppendInput(default_key.backend(),
default_tensor_layout,
default_key.dtype(),
arg_type);
#endif
} else if (arg_type == std::type_index(typeid(DenseTensor*))) {
args_def->AppendOutput(default_key.backend(),
default_tensor_layout,
......@@ -105,13 +103,11 @@ struct KernelArgsParseFunctor<Return_ (*)(Args_...)> {
default_tensor_layout,
default_key.dtype(),
arg_type);
#ifndef PADDLE_WITH_CUSTOM_KERNEL
} else if (arg_type == std::type_index(typeid(SelectedRows*))) {
args_def->AppendOutput(default_key.backend(),
default_tensor_layout,
default_key.dtype(),
arg_type);
#endif
} else {
// Attribute deal with
// TODO(chenweihang): now here allow any types of attribute, maybe
......
......@@ -23,9 +23,7 @@
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/kernel_context.h"
#ifndef PADDLE_WITH_CUSTOM_KERNEL
#include "paddle/phi/core/selected_rows.h"
#endif
#include "paddle/phi/core/sparse_coo_tensor.h"
#include "paddle/phi/core/sparse_csr_tensor.h"
#include "paddle/phi/core/type_defs.h"
......@@ -223,9 +221,7 @@ struct KernelImpl<Return (*)(DevCtx, Args...), kernel_fn> {
PT_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(DenseTensor);
PT_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(SelectedRows);
PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(DenseTensor);
#ifndef PADDLE_WITH_CUSTOM_KERNEL
PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(SelectedRows);
#endif
PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(SparseCooTensor);
PT_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(SparseCooTensor);
......@@ -260,9 +256,7 @@ struct KernelImpl<Return (*)(DevCtx, Args...), kernel_fn> {
PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(DenseTensor);
PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(DenseTensor);
#ifndef PADDLE_WITH_CUSTOM_KERNEL
PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(SelectedRows);
#endif
PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(SparseCooTensor);
PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(SparseCooTensor);
......
......@@ -23,13 +23,6 @@ limitations under the License. */
#include "paddle/utils/any.h"
#include "paddle/utils/optional.h"
// Note: mixed_vector include many header now, LoD will be
// used on CUDA device? Can we use small_vector here?
// @zhanlve: Rollback to original LoD for now
#ifndef PADDLE_WITH_CUSTOM_KERNEL
#include "paddle/fluid/framework/mixed_vector.h"
#endif
namespace phi {
using DDim = phi::DDim;
......
......@@ -225,6 +225,41 @@ void HuberLossInferMeta(const MetaTensor& input,
out->share_lod(input);
}
void IndexSampleInferMeta(const MetaTensor& x,
const MetaTensor& y,
MetaTensor* out,
MetaConfig config) {
auto input_dims = x.dims();
PADDLE_ENFORCE_EQ(input_dims.size(),
2,
errors::InvalidArgument(
"Inputs(X) shape of IndexSample op should be 2-D, but "
"got X's shape = [%s], please check X shape.",
input_dims));
auto index_dims = y.dims();
PADDLE_ENFORCE_EQ(
index_dims.size(),
2,
errors::InvalidArgument(
"Inputs(Index) shape of IndexSample op should be 2-D, but "
"got Index's shape [%s] , please check index shape.",
input_dims));
if (config.is_runtime) {
PADDLE_ENFORCE_EQ(input_dims[0],
index_dims[0],
errors::InvalidArgument(
"Inputs(X)'s value of dimension 0 must same with "
"Inputs(Index)'s value of dimension 0, but "
"got %d of Inputs(X), and got %d of Inputs(Index), "
"please check Inputs shape.",
input_dims[0],
index_dims[0]));
}
out->set_dtype(x.dtype());
out->set_dims(index_dims);
out->share_lod(y);
}
void CrossInferMeta(const MetaTensor& x,
const MetaTensor& y,
int axis,
......@@ -271,8 +306,7 @@ void CrossInferMeta(const MetaTensor& x,
}
void Atan2InferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) {
auto in_dims = x.dims();
out->set_dims(in_dims);
out->share_meta(x);
}
void BCELossInferMeta(const MetaTensor& input,
......
......@@ -53,6 +53,11 @@ void HuberLossInferMeta(const MetaTensor& input_meta,
MetaTensor* residual,
MetaConfig config = MetaConfig());
void IndexSampleInferMeta(const MetaTensor& x,
const MetaTensor& y,
MetaTensor* out,
MetaConfig config = MetaConfig());
void CrossInferMeta(const MetaTensor& x,
const MetaTensor& y,
int axis,
......
......@@ -84,7 +84,7 @@ void BilinearTensorProductInferMeta(const MetaTensor& x,
out->set_dtype(x.dtype());
}
void ConcatInferMeta(const std::vector<MetaTensor>& x,
void ConcatInferMeta(const std::vector<MetaTensor*>& x,
const Scalar& axis_scalar,
MetaTensor* out,
MetaConfig config) {
......@@ -93,10 +93,19 @@ void ConcatInferMeta(const std::vector<MetaTensor>& x,
phi::errors::InvalidArgument(
"The size of input meta vector should be greater"
"than 0."));
if (axis_scalar.FromTensor()) {
auto out_dims =
phi::make_ddim(std::vector<int>(x.at(0)->dims().size(), -1));
out->set_dims(out_dims);
out->set_dtype(x.at(0)->dtype());
out->set_layout(x.at(0)->layout());
out->share_lod(*x.at(0));
return;
}
int axis = axis_scalar.to<int>();
// 1. calculate axis
int rank = x.at(0).dims().size();
int rank = x.at(0)->dims().size();
PADDLE_ENFORCE_EQ(
axis >= -rank && axis < rank,
true,
......@@ -111,15 +120,42 @@ void ConcatInferMeta(const std::vector<MetaTensor>& x,
// 2. calculate out dims
std::vector<phi::DDim> x_dims;
for (auto& x_t : x) {
x_dims.push_back(x_t.dims());
x_dims.reserve(x.size());
for (const auto* x_t : x) {
x_dims.emplace_back(x_t->dims());
}
phi::DDim out_dim =
phi::funcs::ComputeAndCheckShape(config.is_runtime, x_dims, axis);
out->set_dims(out_dim);
out->set_dtype(x.at(0).dtype());
out->set_layout(x.at(0).layout());
out->set_dtype(x.at(0)->dtype());
out->set_layout(x.at(0)->layout());
out->share_lod(*x.at(0));
}
void WhereInferMeta(const MetaTensor& condition,
const MetaTensor& x,
const MetaTensor& y,
MetaTensor* out) {
auto cond_dims = condition.dims();
auto x_dims = x.dims();
auto y_dims = y.dims();
PADDLE_ENFORCE_EQ(
cond_dims,
x_dims,
phi::errors::InvalidArgument(
"The dims of Inputs(Condition) and Inputs(X) should be same. "
"But received Condition's shape is [%s], X's shape is [%s]",
cond_dims,
x_dims));
PADDLE_ENFORCE_EQ(x_dims,
y_dims,
phi::errors::InvalidArgument(
"The dims of Inputs(X) and Inputs(Y) should be same. "
"But received X's shape is [%s], Y's shape is [%s]",
x_dims,
y_dims));
out->share_meta(x);
}
} // namespace phi
......@@ -25,9 +25,13 @@ void BilinearTensorProductInferMeta(const MetaTensor& x,
MetaTensor* out,
MetaConfig config = MetaConfig());
void ConcatInferMeta(const std::vector<MetaTensor>& x,
void ConcatInferMeta(const std::vector<MetaTensor*>& x,
const Scalar& axis_scalar,
MetaTensor* out,
MetaConfig config = MetaConfig());
void WhereInferMeta(const MetaTensor& condition,
const MetaTensor& x,
const MetaTensor& y,
MetaTensor* out);
} // namespace phi
......@@ -459,8 +459,19 @@ void TransferLayoutInferMeta(const MetaTensor& x,
void SplitInferMeta(const MetaTensor& x,
const ScalarArray& num_or_sections,
const Scalar& axis,
std::vector<MetaTensor>* out,
std::vector<MetaTensor*> out,
MetaConfig config) {
if (!config.is_runtime) {
if (axis.FromTensor() || num_or_sections.FromTensor()) {
auto out_dims = phi::make_ddim(std::vector<int>(x.dims().size(), -1));
for (auto* item : out) {
item->set_dims(out_dims);
item->share_lod(x);
}
return;
}
}
int axis_value = axis.to<int>();
int rank = x.dims().size();
PADDLE_ENFORCE_EQ(
......@@ -475,15 +486,16 @@ void SplitInferMeta(const MetaTensor& x,
axis_value = axis_value + rank;
}
std::vector<phi::DDim> out_dims(out.size(), x.dims());
auto input_axis_dim = x.dims().at(axis_value);
auto num_or_sections_data = num_or_sections.GetData();
// step1: get formated sections
std::vector<int64_t> sections;
// num_or_sections is a number
if (num_or_sections_data.size() == 1) {
if (config.is_runtime || input_axis_dim > 0) {
int num = num_or_sections_data.at(0);
PADDLE_ENFORCE_EQ(input_axis_dim % num,
PADDLE_ENFORCE_EQ(
input_axis_dim % num,
0,
phi::errors::InvalidArgument(
"The input's size along the split dimension "
......@@ -494,8 +506,14 @@ void SplitInferMeta(const MetaTensor& x,
x.dims(),
axis_value));
for (int i = 0; i < num; ++i) {
sections.push_back(input_axis_dim / num);
size_t out_axis_dim = input_axis_dim / num;
for (auto& out_dim : out_dims) {
out_dim[axis_value] = out_axis_dim;
}
} else {
for (auto& out_dim : out_dims) {
out_dim[axis_value] = -1;
}
}
} else {
// num_or_sections is a sections
......@@ -503,10 +521,9 @@ void SplitInferMeta(const MetaTensor& x,
int unknow_dim_idx = -1;
int num_of_unknow = 0;
int sum_of_section = 0;
std::vector<int64_t> sections = num_or_sections_data;
for (size_t i = 0; i < num_or_sections_data.size(); ++i) {
sections.push_back(num_or_sections_data[i]);
if (num_or_sections_data[i] == unknow_dim_val) {
num_of_unknow++;
unknow_dim_idx = i;
......@@ -558,31 +575,22 @@ void SplitInferMeta(const MetaTensor& x,
x.dims(),
axis_value));
}
}
// setp2: fill out dims
std::vector<phi::DDim> out_dims(sections.size(), x.dims());
if (config.is_runtime || input_axis_dim > 0) {
for (size_t i = 0; i < sections.size(); ++i) {
for (size_t i = 0; i < out_dims.size(); ++i) {
out_dims[i][axis_value] = sections[i];
}
} else {
for (size_t i = 0; i < sections.size(); ++i) {
out_dims[i][axis_value] = -1;
}
}
for (size_t i = 0; i < sections.size(); ++i) {
for (size_t i = 0; i < out.size(); ++i) {
if (axis_value != 0) {
// Only pass LoD when not spliting along the first dim.
(*out)[i].set_dtype(x.dtype());
(*out)[i].set_dims(out_dims[i]);
(*out)[i].set_layout(x.layout());
out.at(i)->set_dtype(x.dtype());
out.at(i)->set_dims(out_dims[i]);
out.at(i)->set_layout(x.layout());
} else {
(*out)[i].set_dtype(x.dtype());
(*out)[i].set_dims(out_dims[i]);
(*out)[i].set_layout(x.layout());
(*out)[i].share_lod(x);
out.at(i)->set_dtype(x.dtype());
out.at(i)->set_dims(out_dims[i]);
out.at(i)->set_layout(x.layout());
out.at(i)->share_lod(x);
}
}
}
......
......@@ -107,7 +107,7 @@ void TransferLayoutInferMeta(const MetaTensor& x,
void SplitInferMeta(const MetaTensor& x_meta,
const ScalarArray& num_or_sections,
const Scalar& axis,
std::vector<MetaTensor>* out,
std::vector<MetaTensor*> out,
MetaConfig config = MetaConfig());
void UnbindInferMeta(const MetaTensor& x,
......
......@@ -31,13 +31,16 @@ DenseTensor Concat(const Context& dev_ctx,
const std::vector<DenseTensor>& x,
const Scalar& axis) {
std::vector<MetaTensor> meta_x;
meta_x.reserve(x.size());
std::vector<MetaTensor*> meta_x_ptr;
for (const auto& t : x) {
meta_x.emplace_back(t);
meta_x_ptr.push_back(&meta_x.back());
}
auto dense_out = phi::Empty<T, Context>(dev_ctx);
MetaTensor meta_out(&dense_out);
ConcatInferMeta(meta_x, axis.to<int>(), &meta_out, /*is_runtime=*/true);
ConcatInferMeta(meta_x_ptr, axis.to<int>(), &meta_out, /*is_runtime=*/true);
ConcatKernel<T, Context>(dev_ctx, x, axis, &dense_out);
return dense_out;
}
......
......@@ -12,11 +12,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/atan2_grad_kernel.h"
#include "paddle/phi/kernels/impl/atan2_grad_kernel_impl.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/device_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/atan2_grad_kernel_impl.h"
PD_REGISTER_KERNEL(atan2_grad,
CPU,
......
......@@ -12,11 +12,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/atan2_kernel.h"
#include "paddle/phi/kernels/impl/atan2_kernel_impl.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/device_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/atan2_kernel_impl.h"
PD_REGISTER_KERNEL(atan2,
CPU,
......
......@@ -37,6 +37,7 @@ void ConcatKernel(const Context& dev_ctx,
axis = phi::funcs::ComputeAxis(axis, x[0].dims().size());
std::vector<phi::DDim> x_dims;
x_dims.reserve(x.size());
for (size_t i = 0; i < x.size(); ++i) {
x_dims.push_back(x[i].dims());
}
......@@ -97,9 +98,10 @@ void ConcatKernel(const Context& dev_ctx,
}
} else {
std::vector<phi::DenseTensor> inputs;
inputs.reserve(x.size());
for (size_t j = 0; j < x.size(); ++j) {
if (x[j].numel() > 0) {
inputs.push_back(x[j]);
inputs.emplace_back(x[j]);
} else {
continue;
}
......
此差异已折叠。
此差异已折叠。
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/logical_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/cpu/elementwise.h"
#include "paddle/phi/kernels/funcs/logical_functor.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/platform/transform.h"
namespace phi {
#define DEFINE_LOGICAL_BINARY_KERNEL(type) \
template <typename T, typename Context> \
void Logical##type##Kernel(const Context& dev_ctx, \
const DenseTensor& x, \
const DenseTensor& y, \
DenseTensor* out) { \
funcs::Logical##type##Functor<T> binary_func; \
ElementwiseCompute<funcs::Logical##type##Functor<T>, T, bool>( \
dev_ctx, x, y, -1, binary_func, out); \
}
DEFINE_LOGICAL_BINARY_KERNEL(And)
DEFINE_LOGICAL_BINARY_KERNEL(Or)
DEFINE_LOGICAL_BINARY_KERNEL(Xor)
#undef DEFINE_LOGICAL_BINARY_KERNEL
template <typename T, typename Context>
void LogicalNotKernel(const Context& dev_ctx,
const DenseTensor& x,
DenseTensor* out) {
auto* out_ptr = dev_ctx.template Alloc<bool>(out);
funcs::LogicalNotFunctor<T> unary_func;
paddle::platform::Transform<Context> trans;
trans(dev_ctx, x.data<T>(), x.data<T>() + x.numel(), out_ptr, unary_func);
}
} // namespace phi
#define REGISTER_LOGICAL_CPU_KERNEL(logical_and, func_type) \
PD_REGISTER_KERNEL(logical_and, \
CPU, \
ALL_LAYOUT, \
phi::Logical##func_type##Kernel, \
float, \
double, \
bool, \
int64_t, \
int, \
int8_t, \
int16_t) {}
REGISTER_LOGICAL_CPU_KERNEL(logical_and, And)
REGISTER_LOGICAL_CPU_KERNEL(logical_or, Or)
REGISTER_LOGICAL_CPU_KERNEL(logical_not, Not)
REGISTER_LOGICAL_CPU_KERNEL(logical_xor, Xor)
......@@ -28,20 +28,6 @@ void SplitKernel(const Context& dev_ctx,
const ScalarArray& num_or_sections,
const Scalar& axis_scalar,
std::vector<DenseTensor*> outs) {
// need to infershape output
if (num_or_sections.FromTensor() || axis_scalar.FromTensor()) {
std::vector<MetaTensor> out_metas;
for (size_t i = 0; i < outs.size(); ++i) {
out_metas.push_back(outs[i]);
}
phi::SplitInferMeta(x, num_or_sections, axis_scalar, &out_metas, true);
for (size_t i = 0; i < out_metas.size(); ++i) {
outs[i]->Resize(out_metas[i].dims());
}
}
std::vector<const DenseTensor*> shape_refer;
for (size_t j = 0; j < outs.size(); ++j) {
dev_ctx.template Alloc<T>(outs[j]);
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册