Commit c1c6b869 authored by phlrain

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into add_some_yaml_config

......@@ -1256,7 +1256,7 @@ if __name__ == "__main__":
# Node Definition Generation
definition_declaration_pair = GenerateForwardDefinition(
fwd_api_name, bwd_api_name, forward_inputs_position_map,
forward_outputs_position_map, forward_attrs_list,
forward_outputs_position_map, orig_forward_attrs_list,
backward_fwd_input_map, backward_grad_input_map,
backward_grad_output_map, backward_attrs_list, optional_inputs,
intermediate_outputs)
......@@ -1268,7 +1268,7 @@ if __name__ == "__main__":
# For python-level API dispatch
CollectCoreOpsInformation(fwd_api_name, forward_inputs_position_map,
forward_outputs_position_map,
forward_attrs_list)
orig_forward_attrs_list)
if len(namespace) > 0:
forward_definition_str += f"""namespace {namespace} {{
......
......@@ -34,6 +34,14 @@ void TransDataDevice(const Tensor &in, const platform::Place &dst_place,
return;
}
// NOTE(hqp): Special case for CPU->MLU, avoid stream sync.
if (platform::is_cpu_place(in.place()) && platform::is_mlu_place(dst_place)) {
paddle::framework::TensorCopy(
in, dst_place, *platform::DeviceContextPool::Instance().Get(dst_place),
out);
return;
}
// NOTE(yy): TransDataDevice should wait for computation of input.
if (!platform::is_cuda_pinned_place(in.place())) {
platform::DeviceContextPool::Instance().Get(in.place())->Wait();
......
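
For context, a minimal standalone sketch (simplified enums and hypothetical names, not the actual Paddle API) of the decision order the TransDataDevice hunk above establishes: the CPU->MLU pair is copied via TensorCopy on the destination context and returns early, while every other non-CUDA-pinned source still waits on its own device context before the copy.

#include <iostream>

// Illustrative stand-ins for Paddle's Place checks.
enum class Place { kCPU, kCUDAPinned, kMLU, kGPU };

// Returns true when the source device must be synchronized before copying,
// mirroring the branch order in TransDataDevice above.
bool NeedWaitBeforeCopy(Place src, Place dst) {
  if (src == Place::kCPU && dst == Place::kMLU) return false;  // early-return copy path
  return src != Place::kCUDAPinned;  // generic path waits on the input's context
}

int main() {
  std::cout << NeedWaitBeforeCopy(Place::kCPU, Place::kMLU) << "\n";        // 0
  std::cout << NeedWaitBeforeCopy(Place::kGPU, Place::kCPU) << "\n";        // 1
  std::cout << NeedWaitBeforeCopy(Place::kCUDAPinned, Place::kGPU) << "\n"; // 0
}
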
......@@ -95,6 +95,7 @@ std::map<std::string, std::vector<ir::Node *>> Graph::InitFromBlock(
std::unordered_map<std::string, std::pair<VarDesc *, int>>
name_to_desc_block_id;
block_id_ = block.ID();
const BlockDesc *block_var_visible = &block;
while (block_var_visible != nullptr) {
for (auto *var : block_var_visible->AllVars()) {
......
......@@ -230,6 +230,7 @@ class Graph {
auto *x =
AddNode(new ir::Node(var_desc, block_id == -1 ? block_id_ : block_id));
x->SetId(num_node_created_++);
x->SetGraphId(block_id_);
return x;
}
......@@ -245,6 +246,7 @@ class Graph {
"The OpDesc used to create operator node is null."));
auto *x = AddNode(new ir::Node(op_desc));
x->SetId(num_node_created_++);
x->SetGraphId(block_id_);
return x;
}
......@@ -263,6 +265,7 @@ class Graph {
num_node_created_);
auto *x = AddNode(new ir::Node(name, ir::Node::Type::kVariable, block_id_));
x->SetId(num_node_created_++);
x->SetGraphId(block_id_);
return x;
}
......@@ -276,6 +279,7 @@ class Graph {
}
auto *x = AddNode(new ir::Node(name, type, block_id_));
x->SetId(num_node_created_++);
x->SetGraphId(block_id_);
return x;
}
......
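
As a side note, a minimal sketch of the ownership-tagging pattern the hunks above add to ir::Graph / ir::Node (hypothetical, heavily simplified classes): the graph stamps both a per-graph node id and its own block id onto every node it creates, and only the graph can set them.

#include <iostream>
#include <memory>
#include <vector>

class Node {
 public:
  int Id() const { return id_; }
  int GraphId() const { return graph_id_; }

 private:
  friend class Graph;  // ids can only be set by a Graph
  void SetId(int id) { id_ = id; }
  void SetGraphId(int graph_id) { graph_id_ = graph_id; }

  int id_{-1};
  int graph_id_{-1};
};

class Graph {
 public:
  explicit Graph(int block_id) : block_id_(block_id) {}

  Node *CreateNode() {
    nodes_.push_back(std::make_unique<Node>());
    Node *x = nodes_.back().get();
    x->SetId(num_node_created_++);
    x->SetGraphId(block_id_);  // every node remembers which graph/block owns it
    return x;
  }

 private:
  int block_id_;
  int num_node_created_{0};
  std::vector<std::unique_ptr<Node>> nodes_;
};

int main() {
  Graph g(/*block_id=*/3);
  Node *n = g.CreateNode();
  std::cout << n->Id() << " " << n->GraphId() << "\n";  // 0 3
}
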
......@@ -125,6 +125,7 @@ class Node {
// Only use this for auto parallel.
// A node does not have original desc if the return is zero.
uint64_t OriginalDescId() const { return original_desc_id_; }
int GraphId() const { return graph_id_; }
bool IsOp() const { return type_ == Type::kOperation; }
bool IsVar() const { return type_ == Type::kVariable; }
......@@ -246,10 +247,12 @@ class Node {
// Store the original id of var desc or op desc.
// Only use this for auto parallel.
uint64_t original_desc_id_{0};
int graph_id_{-1};
private:
// ID can only set by a Graph.
void SetId(int id) { id_ = id; }
void SetGraphId(int graph_id) { graph_id_ = graph_id; }
// desc_order can only set by a Graph when constructing a Graph from a
// BlockDesc.
......
......@@ -1456,7 +1456,8 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const {
kernel_iter = kernels.find(expected_kernel_key);
}
#endif
#ifdef PADDLE_WITH_XPU
#if defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP)
if (platform::is_xpu_place(expected_kernel_key.place_) &&
(kernel_iter == kernels.end() ||
!paddle::platform::is_xpu_support_op(type_, expected_kernel_key) ||
......@@ -1470,18 +1471,37 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const {
#endif
#ifdef PADDLE_WITH_XPU_KP
if (paddle::platform::is_xpu_place(expected_kernel_key.place_)) {
bool use_xpu_kp_kernel_rt =
FLAGS_run_kp_kernel &&
paddle::platform::is_xpu_kp_support_op(type_, expected_kernel_key);
bool use_xpu_kp_kernel_debug =
paddle::platform::is_in_xpu_kpwhite_list(type_);
if (platform::is_xpu_place(expected_kernel_key.place_) &&
(use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug)) {
if (use_xpu_kp_kernel_rt) {
VLOG(3) << "xpu_kp using rt mode ";
}
if (use_xpu_kp_kernel_debug) {
VLOG(3) << "xpu_kp using debug mode ";
}
bool is_xpu_kp_support = (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug);
if (is_xpu_kp_support) {
expected_kernel_key.library_type_ = LibraryType::kKP;
kernel_iter = kernels.find(expected_kernel_key);
VLOG(3) << "using XPU KP kernel: " << type_
<< ", using_kernel_key:" << expected_kernel_key;
}
bool is_xpu_unsupport =
(!paddle::platform::is_xpu_support_op(type_, expected_kernel_key) ||
paddle::platform::is_in_xpu_black_list(type_));
if (!is_xpu_kp_support &&
(kernel_iter == kernels.end() || is_xpu_unsupport)) {
VLOG(3) << "missing XPU kernel: " << type_
<< ", expected_kernel_key:" << expected_kernel_key
<< ", fallbacking to CPU one!";
expected_kernel_key.place_ = platform::CPUPlace();
kernel_iter = kernels.find(expected_kernel_key);
}
}
#endif
#ifdef PADDLE_WITH_IPU
......
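
For readability, a standalone sketch of the selection logic the ChooseKernel hunk above implements (plain booleans instead of the real kernel map, FLAGS_run_kp_kernel and the white/black lists; all names illustrative): prefer a KP kernel when the runtime flag plus the KP support list, or the KP white list, says so; fall back to a CPU kernel only when no KP kernel was chosen and the plain XPU kernel is missing or unsupported.

#include <iostream>

enum class KernelChoice { kXPU, kXPU_KP, kCPUFallback };

// Illustrative decision function mirroring the branch above.
KernelChoice ChooseXpuKernel(bool run_kp_flag, bool kp_supports_op,
                             bool in_kp_white_list, bool has_xpu_kernel,
                             bool xpu_supports_op, bool in_xpu_black_list) {
  const bool use_kp_rt = run_kp_flag && kp_supports_op;
  const bool use_kp_debug = in_kp_white_list;
  const bool is_kp_support = use_kp_rt || use_kp_debug;
  if (is_kp_support) return KernelChoice::kXPU_KP;

  const bool xpu_unsupported = !xpu_supports_op || in_xpu_black_list;
  if (!has_xpu_kernel || xpu_unsupported) return KernelChoice::kCPUFallback;
  return KernelChoice::kXPU;
}

int main() {
  // KP white-listed op: KP kernel is chosen even though a plain XPU kernel exists.
  std::cout << (ChooseXpuKernel(false, false, true, true, true, false) ==
                KernelChoice::kXPU_KP) << "\n";
  // Black-listed op with no KP support: falls back to CPU.
  std::cout << (ChooseXpuKernel(true, false, false, true, true, true) ==
                KernelChoice::kCPUFallback) << "\n";
}
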
......@@ -1224,8 +1224,12 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
proto::VarType::TensorDesc desc;
{ // int32_t size
// proto buffer
int32_t size;
int32_t size = -1;
is.read(reinterpret_cast<char*>(&size), sizeof(size));
PADDLE_ENFORCE_EQ(is.good(), true, platform::errors::Unavailable(
"Cannot read tensor desc size"));
PADDLE_ENFORCE_GE(size, 0, platform::errors::InvalidArgument(
"Tensor desc size should >= 0"));
std::unique_ptr<char[]> buf(new char[size]);
is.read(reinterpret_cast<char*>(buf.get()), size);
PADDLE_ENFORCE_EQ(
......
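
The hunk above hardens TensorFromStream against a corrupt or truncated stream: the size is initialized, the stream state is checked after the read, and the value is validated before it is used as an allocation length. A self-contained sketch of the same read-then-validate pattern (plain asserts instead of PADDLE_ENFORCE, hypothetical names):

#include <cassert>
#include <cstdint>
#include <memory>
#include <sstream>
#include <string>

// Reads a length-prefixed blob: validate the stream state and the size
// before trusting it as an allocation length.
std::string ReadSizedBlob(std::istream &is) {
  int32_t size = -1;  // initialized so a failed read cannot leave garbage
  is.read(reinterpret_cast<char *>(&size), sizeof(size));
  assert(is.good() && "cannot read blob size");
  assert(size >= 0 && "blob size should be >= 0");
  std::unique_ptr<char[]> buf(new char[size]);
  is.read(buf.get(), size);
  assert(is.good() && "cannot read blob payload");
  return std::string(buf.get(), static_cast<size_t>(size));
}

int main() {
  std::ostringstream os;
  int32_t n = 5;
  os.write(reinterpret_cast<const char *>(&n), sizeof(n));
  os.write("hello", 5);
  std::istringstream is(os.str());
  return ReadSizedBlob(is) == "hello" ? 0 : 1;
}
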
......@@ -124,7 +124,7 @@ AmpOperators::AmpOperators()
OpSupportedInfos("GPU", paddle::framework::proto::VarType::BF16));
unsupported_bf16_ops_->insert(unsupported_ops_gpu_bf16.begin(),
unsupported_ops_gpu_bf16.end());
// NOTE: GPU/NPU/XPU is compiled seperatly.
// NOTE: GPU/NPU/XPU/MLU is compiled seperatly.
#elif defined(PADDLE_WITH_ASCEND_CL)
auto unsupported_ops_npu_fp16 = std::get<2>(
OpSupportedInfos("NPU", paddle::framework::proto::VarType::FP16));
......@@ -143,6 +143,15 @@ AmpOperators::AmpOperators()
OpSupportedInfos("XPU", paddle::framework::proto::VarType::BF16));
unsupported_bf16_ops_->insert(unsupported_ops_xpu_bf16.begin(),
unsupported_ops_xpu_bf16.end());
#elif defined(PADDLE_WITH_MLU)
auto unsupported_ops_mlu_fp16 = std::get<2>(
OpSupportedInfos("MLU", paddle::framework::proto::VarType::FP16));
unsupported_fp16_ops_->insert(unsupported_ops_mlu_fp16.begin(),
unsupported_ops_mlu_fp16.end());
auto unsupported_ops_mlu_bf16 = std::get<2>(
OpSupportedInfos("MLU", paddle::framework::proto::VarType::BF16));
unsupported_bf16_ops_->insert(unsupported_ops_mlu_bf16.begin(),
unsupported_ops_mlu_bf16.end());
#endif
VLOG(4) << allow_ops_->size() << " " << block_ops_->size() << " "
<< unsupported_fp16_ops_->size() << " "
......@@ -210,6 +219,7 @@ inline bool NeedCast(const std::shared_ptr<VarType>& var) {
if (paddle::platform::is_gpu_place(place) ||
paddle::platform::is_cuda_pinned_place(place) ||
paddle::platform::is_xpu_place(place) ||
paddle::platform::is_mlu_place(place) ||
paddle::platform::is_npu_place(place) ||
paddle::platform::is_npu_pinned_place(place)) {
// CudaPinndePlace is added for varbase created by dataloader
......
......@@ -234,7 +234,7 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
auto& kernels = kernels_iter->second;
auto kernel_iter = kernels.find(expected_kernel_key);
#ifdef PADDLE_WITH_XPU
#if defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP)
if (paddle::platform::is_xpu_place(expected_kernel_key.place_) &&
(kernel_iter == kernels.end() || is_xpu_unsupport)) {
VLOG(3) << "missing XPU kernel: " << op.Type()
......@@ -243,11 +243,10 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
expected_kernel_key.place_ = platform::CPUPlace();
kernel_iter = kernels.find(expected_kernel_key);
}
#endif
#ifdef PADDLE_WITH_XPU_KP
expected_kernel_key.place_ = platform::XPUPlace();
if (paddle::platform::is_xpu_place(expected_kernel_key.place_)) {
bool use_xpu_kp_kernel_rt =
FLAGS_run_kp_kernel &&
paddle::platform::is_xpu_kp_support_op(op.Type(), expected_kernel_key);
......@@ -259,14 +258,22 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
if (use_xpu_kp_kernel_debug) {
VLOG(3) << "xpu_kp using debug mode ";
}
if (paddle::platform::is_xpu_place(expected_kernel_key.place_) &&
(use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug)) {
expected_kernel_key.place_ = platform::XPUPlace();
bool is_xpu_kp_support = (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug);
if (is_xpu_kp_support) {
expected_kernel_key.library_type_ = paddle::framework::LibraryType::kKP;
kernel_iter = kernels.find(expected_kernel_key);
VLOG(3) << "using XPU KP kernel: " << op.Type()
<< ", using_kernel_key:" << expected_kernel_key;
}
if (!is_xpu_kp_support &&
(kernel_iter == kernels.end() || is_xpu_unsupport)) {
VLOG(3) << "missing XPU kernel: " << op.Type()
<< ", expected_kernel_key:" << expected_kernel_key
<< ", fallbacking to CPU one!";
expected_kernel_key.place_ = platform::CPUPlace();
kernel_iter = kernels.find(expected_kernel_key);
}
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
......
......@@ -341,7 +341,6 @@ void BuildDygraphPhiKernelContext(
}
for (size_t i = 0; i < attr_names.size(); ++i) {
VLOG(1) << "############## attr_name: " << i << " : " << attr_names[i];
if (attr_defs[i].type_index == std::type_index(typeid(phi::ScalarArray))) {
if (attrs.find(attr_names[i]) !=
attrs.end()) { // shape is in the attribute
......
......@@ -1485,6 +1485,13 @@ REGISTER_ACTIVATION_OP(atanh, Atanh, AtanhFunctor, AtanhGradFunctor);
REGISTER_ACTIVATION_OP(brelu, BRelu, BReluFunctor, BReluGradFunctor);
REGISTER_ACTIVATION_OP(thresholded_relu, ThresholdedRelu,
ThresholdedReluFunctor, ThresholdedReluGradFunctor);
REGISTER_ACTIVATION_OP(hard_shrink, HardShrink, HardShrinkFunctor,
HardShrinkGradFunctor);
REGISTER_ACTIVATION_OP(softshrink, SoftShrink, SoftShrinkFunctor,
SoftShrinkGradFunctor);
REGISTER_ACTIVATION_OP(tanh_shrink, TanhShrink, TanhShrinkFunctor,
TanhShrinkGradFunctor);
REGISTER_ACTIVATION_OP(silu, Silu, SiluFunctor, SiluGradFunctor);
/* ========================== sigmoid register =============================
*/
......@@ -1626,22 +1633,6 @@ REGISTER_OPERATOR(
ops::ActivationOpDoubleGrad<ops::ELUGradFunctor<float>::FwdDeps()>,
ops::ActivationDoubleGradOpInplaceInferer);
REGISTER_OP_CPU_KERNEL(elu,
ops::ActivationKernel<paddle::platform::CPUDeviceContext,
ops::ELUFunctor<float>>,
ops::ActivationKernel<paddle::platform::CPUDeviceContext,
ops::ELUFunctor<double>>);
REGISTER_OP_CPU_KERNEL(
elu_grad, ops::ELUGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::ELUGradKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
elu_grad_grad, ops::ELUDoubleGradKernel<plat::CPUDeviceContext,
ops::ELUGradGradFunctor<float>>,
ops::ELUDoubleGradKernel<plat::CPUDeviceContext,
ops::ELUGradGradFunctor<double>>,
ops::ELUDoubleGradKernel<plat::CPUDeviceContext,
ops::ELUGradGradFunctor<plat::float16>>);
/* ========================================================================== */
/* ======================== logit register ============================
......
......@@ -279,6 +279,15 @@ USE_PHI_FUNCTOR(BRelu)
USE_PHI_FUNCTOR(ThresholdedRelu)
USE_PHI_FUNCTOR(LeakyRelu)
USE_PHI_DOUBLE_GRAD_FUNCTOR(LeakyRelu)
USE_PHI_FUNCTOR(HardShrink)
USE_PHI_FUNCTOR(SoftShrink)
USE_PHI_FUNCTOR(TanhShrink)
USE_PHI_FUNCTOR(Silu)
USE_PHI_FUNCTOR(ELU)
USE_PHI_DOUBLE_GRAD_FUNCTOR(ELU)
template <typename T>
using ELUGradNegativeAlphaFunctor = phi::funcs::ELUGradNegativeAlphaFunctor<T>;
template <typename T>
struct SigmoidGradFunctor : public BaseActivationFunctor<T> {
......@@ -392,31 +401,6 @@ struct SigmoidTripleGradFunctor : public BaseActivationFunctor<T> {
}
};
// silu(x) = x / (1 + exp(-x))
template <typename T>
struct SiluFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
auto temp = static_cast<T>(1) / (static_cast<T>(1) + (-x).exp());
out.device(d) = x * temp;
}
};
// silu'(x) = (1 / (1 + e^{-x})) * (1 + out * e^{-x}))
template <typename T>
struct SiluGradFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
auto temp1 = static_cast<T>(1) + (-x).exp(); // 1+e^(-x)
auto temp2 = x * (-x).exp(); // x*e^(-x)
dx.device(d) = dout * ((static_cast<T>(1) / temp1) *
(static_cast<T>(1) + (temp2 / temp1)));
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
// Originally: logsigmoid(x) = -log (1 + exp(-x))
// For numerical stability, we can use the log-sum-exp trick:
// https://hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/
......@@ -512,99 +496,6 @@ using ReluGradGradFunctor = phi::funcs::ReluGradGradFunctor<T>;
template <typename T>
using ReluCUDAFunctor = phi::funcs::ReluCUDAFunctor<T>;
// tanhshrink(x) = x - tanh(x)
// where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
template <typename T>
struct TanhShrinkFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
out.device(d) = x - x.tanh();
}
};
template <typename T>
struct TanhShrinkGradFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
dx.device(d) = dout * (x.tanh() * x.tanh());
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
// tanhshrink(x) = x - tanh(x)
// where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
template <typename T>
struct HardShrinkFunctor : public BaseActivationFunctor<T> {
float threshold;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"threshold", &threshold}};
}
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
auto temp1 = x < static_cast<T>(threshold * -1.f);
auto temp2 = x > static_cast<T>(threshold);
out.device(d) = x * (temp1 || temp2).template cast<T>();
}
};
template <typename T>
struct HardShrinkGradFunctor : public BaseActivationFunctor<T> {
float threshold;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"threshold", &threshold}};
}
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
auto temp1 = x < static_cast<T>(threshold * -1.f);
auto temp2 = x > static_cast<T>(threshold);
dx.device(d) = dout * (temp1 || temp2).template cast<T>();
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
// softshrink(x) = x - lambda, if x > lambda; x + lambda, if x < -lambda; 0
// otherwise
template <typename T>
struct SoftShrinkFunctor : public BaseActivationFunctor<T> {
float lambda;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"lambda", &lambda}};
}
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
auto lambdaT = static_cast<T>(lambda);
auto temp1 = (x > lambdaT).template cast<T>();
auto temp2 = (x < -lambdaT).template cast<T>();
out.device(d) = temp1 * (x - lambdaT) + temp2 * (x + lambdaT);
}
};
template <typename T>
struct SoftShrinkGradFunctor : public BaseActivationFunctor<T> {
float lambda;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"lambda", &lambda}};
}
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
auto lambdaT = static_cast<T>(lambda);
auto temp1 = (x > lambdaT).template cast<T>();
auto temp2 = (x < -lambdaT).template cast<T>();
dx.device(d) = dout * (temp1 + temp2).template cast<T>();
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
// sqrt(x) = x^(1/2)
template <typename T>
struct SqrtFunctor : public BaseActivationFunctor<T> {
......@@ -1036,59 +927,6 @@ struct SoftReluGradFunctor : public BaseActivationFunctor<T> {
}
};
template <typename T>
struct ELUFunctor : public BaseActivationFunctor<T> {
float alpha;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"alpha", &alpha}};
}
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
out.device(d) =
(x < static_cast<T>(0))
.select(static_cast<T>(alpha) * (x.exp() - static_cast<T>(1)), x);
}
};
template <typename T>
struct ELUGradFunctor : public BaseActivationFunctor<T> {
float alpha;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"alpha", &alpha}};
}
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
// case 1: alpha >= 0
// dx = dout, if out > 0
// dx = dout * (out + alpha), if out <= 0
dx.device(d) = (out > static_cast<T>(0))
.select(dout, dout * (out + static_cast<T>(alpha)));
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct ELUGradNegativeAlphaFunctor : public BaseActivationFunctor<T> {
float alpha;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"alpha", &alpha}};
}
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
// case 2: alpha < 0
// dx = dout, if x > 0
// dx = dout * (out + alpha), if x <=0
dx.device(d) = (x > static_cast<T>(0))
.select(dout, dout * static_cast<T>(alpha) * x.exp());
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename DeviceContext, typename T>
class ELUGradKernel : public framework::OpKernel<T> {
public:
......@@ -1354,44 +1192,6 @@ struct AbsGradGradFunctor : public BaseActivationFunctor<T> {
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct ELUGradGradFunctor : public BaseActivationFunctor<T> {
float alpha;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"alpha", &alpha}};
}
template <typename Device>
void operator()(const Device& dev, const framework::Tensor* X,
const framework::Tensor* ddX, framework::Tensor* ddOut,
const framework::Tensor* dOut, framework::Tensor* dX) const {
auto* d = dev.eigen_device();
auto ddx = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(ddX, "Input", "DDX", "ELUGradGrad"));
auto x = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(X, "Input", "X", "ELUGradGrad"));
if (dX) {
auto dx = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(dX, "Output", "DX", "ELUGradGrad"));
auto dout = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(dOut, "Output", "DOut", "ELUGradGrad"));
dx.device(*d) = ddx * dout * static_cast<T>(alpha) * x.exp() *
(x <= static_cast<T>(0)).template cast<T>();
}
if (ddOut) {
auto ddout = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(ddOut, "Output", "DDOut", "ELUGradGrad"));
ddout.device(*d) = ddx *
((x > static_cast<T>(0)).template cast<T>() +
static_cast<T>(alpha) * x.exp() *
(x <= static_cast<T>(0)).template cast<T>())
.template cast<T>();
}
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct CELUGradGradFunctor : public BaseActivationFunctor<T> {
float alpha;
......@@ -2152,9 +1952,7 @@ struct LogGradGradFunctor : public BaseActivationFunctor<T> {
} // namespace paddle
#define FOR_EACH_ACTIVATION_OP(__macro) \
__macro(silu, Silu, SiluFunctor, SiluGradFunctor); \
__macro(logsigmoid, LogSigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor); \
__macro(softshrink, SoftShrink, SoftShrinkFunctor, SoftShrinkGradFunctor); \
__macro(ceil, Ceil, CeilFunctor, ZeroGradFunctor); \
__macro(floor, Floor, FloorFunctor, ZeroGradFunctor); \
__macro(round, Round, RoundFunctor, ZeroGradFunctor); \
......@@ -2167,8 +1965,6 @@ struct LogGradGradFunctor : public BaseActivationFunctor<T> {
__macro(softplus, Softplus, SoftplusFunctor, SoftplusGradFunctor); \
__macro(softsign, Softsign, SoftsignFunctor, SoftsignGradFunctor); \
__macro(relu6, Relu6, Relu6Functor, Relu6GradFunctor); \
__macro(tanh_shrink, TanhShrink, TanhShrinkFunctor, TanhShrinkGradFunctor); \
__macro(hard_shrink, HardShrink, HardShrinkFunctor, HardShrinkGradFunctor); \
__macro(hard_sigmoid, HardSigmoid, HardSigmoidFunctor, \
HardSigmoidGradFunctor); \
__macro(swish, Swish, SwishFunctor, SwishGradFunctor); \
......
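
The ELU functors removed above encode the two gradient cases spelled out in their comments (alpha >= 0 differentiates through out, alpha < 0 through x; on the negative branch out + alpha equals alpha * e^x). A small numeric sketch of those formulas, independent of Paddle, with a finite-difference check:

#include <cmath>
#include <cstdio>

double elu(double x, double alpha) {
  return x > 0 ? x : alpha * (std::exp(x) - 1.0);
}

// Analytic gradient: d elu / dx = 1 for x > 0 and alpha * exp(x) for x <= 0,
// which equals out + alpha on the negative branch when alpha >= 0.
double elu_grad(double x, double alpha) {
  return x > 0 ? 1.0 : alpha * std::exp(x);
}

int main() {
  const double alpha = 1.5, eps = 1e-6;
  for (double x : {-2.0, -0.5, 0.7}) {
    double numeric = (elu(x + eps, alpha) - elu(x - eps, alpha)) / (2 * eps);
    std::printf("x=% .2f analytic=%.6f numeric=%.6f\n", x, elu_grad(x, alpha),
                numeric);
  }
}
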
......@@ -44,35 +44,6 @@ struct CudaSigmoidGradFunctor : public BaseActivationFunctor<T> {
}
};
template <typename T>
struct CudaSiluFunctor : public BaseActivationFunctor<T> {
using MPType = typename details::MPTypeTrait<T>::Type;
MPType one = static_cast<MPType>(1.0f);
// silu(x) = x / (1 + exp(-x))
__device__ __forceinline__ T operator()(const T arg_x) const {
MPType x = static_cast<MPType>(arg_x);
return static_cast<T>(x / (one + exp(-x)));
}
};
template <typename T>
struct CudaSiluGradFunctor : public BaseActivationFunctor<T> {
using MPType = typename details::MPTypeTrait<T>::Type;
MPType one = static_cast<MPType>(1.0f);
// dx = dout * (1 + exp(-x) + x * exp(-x) / (1 + exp(-x))^2)
__device__ __forceinline__ T operator()(const T arg_dout,
const T arg_x) const {
MPType dout = static_cast<MPType>(arg_dout);
MPType x = static_cast<MPType>(arg_x);
MPType temp = one / (one + exp(-x));
return static_cast<T>(dout * (temp * (one + x * (one - temp))));
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct CudaLogSigmoidFunctor : public BaseActivationFunctor<T> {
using MPType = typename details::MPTypeTrait<T>::Type;
......@@ -110,43 +81,6 @@ struct CudaLogSigmoidGradFunctor : public BaseActivationFunctor<T> {
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct CudaSoftShrinkFunctor : public BaseActivationFunctor<T> {
float lambda;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"lambda", &lambda}};
}
// softshrink(x) = x - lambda, if x > lambda;
// x + lambda, if x < -lambda;
// 0, otherwise.
__device__ __forceinline__ T operator()(const T x) const {
T l = static_cast<T>(lambda);
T temp1 = static_cast<T>(x > l);
T temp2 = static_cast<T>(x < -l);
return temp1 * (x - l) + temp2 * (x + l);
}
};
template <typename T>
struct CudaSoftShrinkGradFunctor : public BaseActivationFunctor<T> {
T zero = static_cast<T>(0.0f);
float lambda;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"lambda", &lambda}};
}
// dx = dout, if x > lambda or x < -lambda else 0
__device__ __forceinline__ T operator()(const T dout, const T x) const {
T l = static_cast<T>(lambda);
return (x >= -l && x <= l) ? zero : dout;
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct CudaCeilFunctor : public BaseActivationFunctor<T> {
using MPType = typename details::MPTypeTrait<T>::Type;
......@@ -615,66 +549,6 @@ struct CudaRelu6GradFunctor : public BaseActivationFunctor<T> {
}
};
template <typename T>
struct CudaTanhShrinkFunctor : public BaseActivationFunctor<T> {
using MPType = typename details::MPTypeTrait<T>::Type;
// tanhshrink(x) = x - tanh(x)
__device__ __forceinline__ T operator()(const T arg_x) const {
MPType x = static_cast<MPType>(arg_x);
return static_cast<T>(x - tanh(x));
}
};
template <typename T>
struct CudaTanhShrinkGradFunctor : public BaseActivationFunctor<T> {
using MPType = typename details::MPTypeTrait<T>::Type;
// dx = dout * tanh(x)^2
__device__ __forceinline__ T operator()(const T arg_dout,
const T arg_x) const {
MPType dout = static_cast<MPType>(arg_dout);
MPType x = static_cast<MPType>(arg_x);
return static_cast<T>(dout * tanh(x) * tanh(x));
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct CudaHardShrinkFunctor : public BaseActivationFunctor<T> {
T zero = static_cast<T>(0.0f);
float threshold;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"threshold", &threshold}};
}
// hadrshrink(x) = (x > -threshold && x < threshold) ? 0 : x
__device__ __forceinline__ T operator()(const T x) const {
T t = static_cast<T>(threshold);
return (x > -t && x < t) ? zero : x;
}
};
template <typename T>
struct CudaHardShrinkGradFunctor : public BaseActivationFunctor<T> {
T zero = static_cast<T>(0.0f);
float threshold;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"threshold", &threshold}};
}
// dx = (x > -threshold && x < threshold) ? 0 : dout
__device__ __forceinline__ T operator()(const T dout, const T x) const {
T t = static_cast<T>(threshold);
return (x > -t && x < t) ? zero : dout;
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct CudaHardSigmoidFunctor : public BaseActivationFunctor<T> {
T zero = static_cast<T>(0.0f);
......@@ -863,110 +737,6 @@ struct CudaHardSwishGradFunctor : public BaseActivationFunctor<T> {
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct CudaELUFunctor : public BaseActivationFunctor<T> {
using CT = typename details::MPTypeTrait<T>::Type;
CT zero = static_cast<CT>(0.0f);
CT one = static_cast<CT>(1.0f);
float alpha;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"alpha", &alpha}};
}
// elu(x) = x, if x > 0
// elu(x) = alpha * (e^x - 1), if x <= 0
__device__ __forceinline__ T operator()(const T arg_x) const {
CT x = static_cast<CT>(arg_x);
CT temp = static_cast<CT>(alpha) * (exp(x) - one);
CT res = x > zero ? x : temp;
return static_cast<T>(res);
}
};
template <typename T>
struct CudaELUGradFunctor : public BaseActivationFunctor<T> {
using MPType = typename details::MPTypeTrait<T>::Type;
MPType zero = static_cast<MPType>(0.0f);
float alpha;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"alpha", &alpha}};
}
// case 1: alpha >= 0
// dx = dout, if out > 0
// dx = dout * (out + alpha), if out <= 0
__device__ __forceinline__ T operator()(T arg_dout, T arg_out) const {
MPType dout = static_cast<MPType>(arg_dout);
MPType out = static_cast<MPType>(arg_out);
MPType a = static_cast<MPType>(alpha);
MPType out_pos = static_cast<MPType>(out > zero);
MPType out_neg = static_cast<MPType>(out <= zero);
return static_cast<T>(dout * (out_pos + out_neg * (out + a)));
}
static constexpr ActBwdOpFwdDeps FwdDeps() {
return ActBwdOpFwdDeps::kDepOut;
}
};
template <typename T>
struct CudaELUGradNegativeAlphaFunctor : public BaseActivationFunctor<T> {
using MPType = typename details::MPTypeTrait<T>::Type;
MPType zero = static_cast<MPType>(0.0f);
float alpha;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"alpha", &alpha}};
}
// case 2: alpha < 0
// dx = dout, if x > 0
// dx = dout * (out + alpha), if x <=0
__device__ __forceinline__ T operator()(const T arg_dout, const T arg_out,
const T arg_x) const {
MPType dout = static_cast<MPType>(arg_dout);
MPType out = static_cast<MPType>(arg_out);
MPType x = static_cast<MPType>(arg_x);
MPType a = static_cast<MPType>(alpha);
MPType x_pos = static_cast<MPType>(x > zero);
MPType x_neg = static_cast<MPType>(x <= zero);
return static_cast<T>(dout * (x_pos + x_neg * (out + a)));
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename DeviceContext, typename T>
class ELUGradCudaKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const {
auto* d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
auto* out = ctx.Input<framework::Tensor>("Out");
auto* x = ctx.Input<framework::Tensor>("X");
auto* d_x = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
d_x->mutable_data<T>(ctx.GetPlace());
const float alpha = ctx.Attr<float>("alpha");
auto& dev_ctx = ctx.device_context<DeviceContext>();
std::vector<const framework::Tensor*> ins = {d_out, out};
std::vector<framework::Tensor*> outs = {d_x};
if (alpha > 0) {
CudaELUGradFunctor<T> functor;
functor.alpha = alpha;
paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
&outs, functor);
} else {
CudaELUGradNegativeAlphaFunctor<T> functor;
functor.alpha = alpha;
ins.push_back(x);
paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
&outs, functor);
}
}
};
template <typename T>
struct CudaCELUFunctor : public BaseActivationFunctor<T> {
using CT = typename details::MPTypeTrait<T>::Type;
......@@ -1099,6 +869,15 @@ USE_PHI_FUNCTOR(CudaTanh)
USE_PHI_FUNCTOR(CudaBRelu)
USE_PHI_FUNCTOR(CudaLeakyRelu)
USE_PHI_FUNCTOR(CudaThresholdedRelu)
USE_PHI_FUNCTOR(CudaHardShrink)
USE_PHI_FUNCTOR(CudaSoftShrink)
USE_PHI_FUNCTOR(CudaTanhShrink)
USE_PHI_FUNCTOR(CudaSilu)
USE_PHI_FUNCTOR(CudaELU)
template <typename T>
using CudaELUGradNegativeAlphaFunctor =
phi::funcs::CudaELUGradNegativeAlphaFunctor<T>;
} // namespace operators
} // namespace paddle
......@@ -1158,26 +937,6 @@ namespace plat = paddle::platform;
ops::ActivationGradCudaKernel<plat::CUDADeviceContext, \
ops::grad_functor<plat::bfloat16>>);
/* ======================== elu register ============================ */
REGISTER_OP_CUDA_KERNEL(
elu, ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext,
ops::CudaELUFunctor<float>>,
ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext,
ops::CudaELUFunctor<double>>,
ops::ActivationCudaKernel<plat::CUDADeviceContext,
ops::CudaELUFunctor<plat::float16>>);
REGISTER_OP_CUDA_KERNEL(
elu_grad, ops::ELUGradCudaKernel<plat::CUDADeviceContext, float>,
ops::ELUGradCudaKernel<plat::CUDADeviceContext, double>,
ops::ELUGradCudaKernel<plat::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL(
elu_grad_grad, ops::ELUDoubleGradKernel<plat::CUDADeviceContext,
ops::ELUGradGradFunctor<float>>,
ops::ELUDoubleGradKernel<plat::CUDADeviceContext,
ops::ELUGradGradFunctor<double>>,
ops::ELUDoubleGradKernel<plat::CUDADeviceContext,
ops::ELUGradGradFunctor<plat::float16>>);
/* ========================================================================== */
/* ======================== celu register ============================ */
......@@ -1359,7 +1118,6 @@ REGISTER_OP_CUDA_KERNEL(
/* ========================================================================== */
#define FOR_EACH_ACTIVATION_CUDA_OP(__macro) \
__macro(silu, Silu, CudaSiluFunctor, CudaSiluGradFunctor); \
__macro(logsigmoid, LogSigmoid, CudaLogSigmoidFunctor, \
CudaLogSigmoidGradFunctor); \
__macro(softshrink, SoftShrink, CudaSoftShrinkFunctor, \
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/batch_norm_op.h"
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
......@@ -20,6 +21,8 @@ namespace operators {
template <typename T>
class MLUBatchNormOpKernel : public framework::OpKernel<T> {
using MPDType = typename details::MPTypeTrait<T>::Type;
public:
void Compute(const framework::ExecutionContext &ctx) const override {
const auto &place = ctx.GetPlace();
......@@ -68,10 +71,10 @@ class MLUBatchNormOpKernel : public framework::OpKernel<T> {
// alloc memory
y->mutable_data<T>(place);
mean_out->mutable_data<T>(place);
variance_out->mutable_data<T>(place);
saved_mean->mutable_data<T>(place);
saved_variance->mutable_data<T>(place);
mean_out->mutable_data<MPDType>(place);
variance_out->mutable_data<MPDType>(place);
saved_mean->mutable_data<MPDType>(place);
saved_variance->mutable_data<MPDType>(place);
Tensor transformed_x;
Tensor transformed_y;
......@@ -132,6 +135,8 @@ class MLUBatchNormOpKernel : public framework::OpKernel<T> {
template <typename T>
class MLUBatchNormGradOpKernel : public framework::OpKernel<T> {
using MPDType = typename details::MPTypeTrait<T>::Type;
public:
void Compute(const framework::ExecutionContext &ctx) const override {
const auto *x = ctx.Input<Tensor>("X");
......@@ -154,10 +159,10 @@ class MLUBatchNormGradOpKernel : public framework::OpKernel<T> {
auto &dev_ctx = ctx.template device_context<MLUDeviceContext>();
auto d_x_tmp =
ctx.AllocateTmpTensor<T, MLUDeviceContext>(x->dims(), dev_ctx);
auto scale_grad_tmp =
ctx.AllocateTmpTensor<T, MLUDeviceContext>(scale->dims(), dev_ctx);
auto scale_grad_tmp = ctx.AllocateTmpTensor<MPDType, MLUDeviceContext>(
scale->dims(), dev_ctx);
auto bias_grad_tmp =
ctx.AllocateTmpTensor<T, MLUDeviceContext>(bias->dims(), dev_ctx);
ctx.AllocateTmpTensor<MPDType, MLUDeviceContext>(bias->dims(), dev_ctx);
if (d_x == nullptr) {
d_x = &d_x_tmp;
......@@ -171,8 +176,8 @@ class MLUBatchNormGradOpKernel : public framework::OpKernel<T> {
const auto &place = ctx.GetPlace();
d_x->mutable_data<T>(place);
d_scale->mutable_data<T>(place);
d_bias->mutable_data<T>(place);
d_scale->mutable_data<MPDType>(place);
d_bias->mutable_data<MPDType>(place);
use_global_stats = is_test || use_global_stats;
......
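
The MLU batch-norm hunks above switch the statistics and parameter gradients from T to MPDType, the higher-precision "master" type used under mixed precision (the trait comes from paddle/fluid/operators/amp/fp16_type_traits.h, included above). A sketch of what such a trait boils down to, with illustrative stand-in types rather than the real implementation:

#include <cstdio>
#include <type_traits>

struct float16 { unsigned short bits; };  // stand-in for platform::float16

// Master-precision type: low-precision inputs accumulate statistics in float.
template <typename T> struct MPTypeTrait          { using Type = T; };
template <>           struct MPTypeTrait<float16> { using Type = float; };

int main() {
  std::printf("%d %d\n",
              (int)std::is_same<MPTypeTrait<float16>::Type, float>::value,  // 1
              (int)std::is_same<MPTypeTrait<float>::Type, float>::value);   // 1
}
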
......@@ -12,8 +12,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/phi/infermeta/unary.h"
namespace paddle {
namespace operators {
......@@ -21,14 +23,6 @@ namespace operators {
class CumprodOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Cumprod");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Cumprod");
ctx->ShareDim("X", "Out");
ctx->ShareLoD("X", "Out");
}
};
class CumprodOpMaker : public framework::OpProtoAndCheckerMaker {
......@@ -82,9 +76,12 @@ class CumprodGradOp : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(cumprod, CumprodInferShapeFunctor,
PD_INFER_META(phi::UnchangedInferMeta));
REGISTER_OPERATOR(cumprod, ops::CumprodOp, ops::CumprodOpMaker,
ops::CumprodGradOpMaker<paddle::framework::OpDesc>,
ops::CumprodGradOpMaker<paddle::imperative::OpBase>);
ops::CumprodGradOpMaker<paddle::imperative::OpBase>,
CumprodInferShapeFunctor);
REGISTER_OPERATOR(cumprod_grad, ops::CumprodGradOp);
......@@ -15,9 +15,14 @@ limitations under the License. */
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/backward.h"
#include "paddle/phi/infermeta/binary.h"
namespace paddle {
namespace operators {
......@@ -26,58 +31,6 @@ class GatherOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
platform::errors::InvalidArgument(
"Input(X) of GatherOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasInput("Index"), true,
platform::errors::InvalidArgument(
"Input(Index) of GatherOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
platform::errors::InvalidArgument(
"Output(Out) of GatherOp should not be null."));
auto index_dims = ctx->GetInputDim("Index");
if (index_dims.size() == 2) {
PADDLE_ENFORCE_EQ(
index_dims[1], 1,
platform::errors::InvalidArgument(
"The last dim of index should be 1 when it is 2D, but we get %d",
index_dims[1]));
} else {
PADDLE_ENFORCE_EQ(
index_dims.size(), 1,
platform::errors::InvalidArgument(
"The index should be 1D, when it is not 2D, but we get %d",
index_dims.size()));
}
auto axis = ctx->Attrs().Get<int>("axis");
auto input_dim = ctx->GetInputDim("X");
if (ctx->HasInput("Axis") || axis == 0) {
// if HasInput("Axis"), we can not obtain correct shape of output
int batch_size = index_dims[0];
framework::DDim output_dims(input_dim);
output_dims[0] = batch_size;
ctx->SetOutputDim("Out", output_dims);
ctx->ShareLoD("X", /*->*/ "Out");
} else {
int index_size = index_dims[0];
std::vector<int> out_dim_vec;
for (int i = 0; i < axis; i++) {
out_dim_vec.push_back(input_dim[i]);
}
out_dim_vec.push_back(index_size);
for (int i = axis + 1; i < input_dim.size(); i++) {
out_dim_vec.push_back(input_dim[i]);
}
auto output_dims = phi::make_ddim(out_dim_vec);
ctx->SetOutputDim("Out", output_dims);
ctx->ShareLoD("X", /*->*/ "Out");
}
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
......@@ -100,11 +53,6 @@ class GatherGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
ctx->ShareLoD("X", /*-->*/ framework::GradVarName("X"));
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
......@@ -193,11 +141,17 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(GatherGradNoNeedBufferVarInferer, "X");
} // namespace paddle
namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(gather, GatherInferShapeFunctor,
PD_INFER_META(phi::GatherInferMeta));
REGISTER_OPERATOR(gather, ops::GatherOp, ops::GatherOpMaker,
ops::GatherGradOpMaker<paddle::framework::OpDesc>,
ops::GatherGradOpMaker<paddle::imperative::OpBase>);
ops::GatherGradOpMaker<paddle::imperative::OpBase>,
GatherInferShapeFunctor);
DECLARE_INFER_SHAPE_FUNCTOR(gather_grad, GatherGradInferShapeFunctor,
PD_INFER_META(phi::GeneralUnaryGradInferMeta));
REGISTER_OPERATOR(gather_grad, ops::GatherGradOp,
ops::GatherGradNoNeedBufferVarInferer);
ops::GatherGradNoNeedBufferVarInferer,
GatherGradInferShapeFunctor);
REGISTER_OP_VERSION(gather)
.AddCheckpoint(R"ROC(upgrad gather, add a new input [Axis])ROC",
......
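
The InferShape body deleted above now lives behind phi::GatherInferMeta; for reference, the core output-shape rule as a standalone helper (hypothetical name, plain vectors instead of DDim): the axis dimension of X is replaced by the number of indices, and an explicit Axis input (or axis == 0) reduces to replacing the leading dimension.

#include <cstdint>
#include <iostream>
#include <vector>

// Output dims of gather(X, Index, axis), mirroring the removed InferShape.
std::vector<int64_t> GatherOutDims(const std::vector<int64_t> &x_dims,
                                   int64_t index_size, int axis) {
  std::vector<int64_t> out;
  for (int i = 0; i < axis; ++i) out.push_back(x_dims[i]);
  out.push_back(index_size);
  for (size_t i = axis + 1; i < x_dims.size(); ++i) out.push_back(x_dims[i]);
  return out;
}

int main() {
  for (int64_t d : GatherOutDims({4, 5, 6}, /*index_size=*/3, /*axis=*/1))
    std::cout << d << " ";  // 4 3 6
  std::cout << "\n";
}
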
......@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/grid_sampler_op.h"
#include <memory>
#include <string>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
......@@ -229,15 +229,6 @@ REGISTER_OPERATOR(grid_sampler, ops::GridSampleOp, ops::GridSampleOpMaker,
ops::GridSampleGradMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(grid_sampler_grad, ops::GridSampleOpGrad);
REGISTER_OP_CPU_KERNEL(
grid_sampler,
ops::GridSampleOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::GridSampleOpKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
grid_sampler_grad,
ops::GridSampleGradOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::GridSampleGradOpKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_VERSION(grid_sampler)
.AddCheckpoint(
R"ROC(
......
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/grid_sampler_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
namespace paddle {
namespace operators {
static __forceinline__ __device__ bool in_bounds(int h, int w, int H, int W) {
return h >= 0 && h < H && w >= 0 && w < W;
}
template <typename T>
static __forceinline__ __device__ void atomic_add(T* data, int h, int w, int sH,
int sW, int H, int W,
T delta) {
if (in_bounds(h, w, H, W)) {
platform::CudaAtomicAdd(data + h * sH + w * sW, delta);
}
}
template <typename T>
static __forceinline__ __device__ T _unnormalize(T coord, int size,
bool align_corners) {
if (align_corners) {
return ((coord + 1.f) / 2) * (size - 1);
} else {
return ((coord + 1.f) * size - 1) / 2;
}
}
template <typename T>
static __forceinline__ __device__ T clip_indexes(T in, int max_value) {
return min(static_cast<T>(max_value), max(in, static_cast<T>(0)));
}
template <typename T>
static __forceinline__ __device__ T reflect_indexes(T in, int twice_low,
int twice_high) {
if (twice_low == twice_high) {
return static_cast<T>(0);
}
T min = static_cast<T>(twice_low) / 2;
T span = static_cast<T>(twice_high - twice_low) / 2;
in = fabs(in - min);
T extra = fmod(in, span);
int flips = static_cast<int>(floor(in / span));
if (flips % 2 == 0) {
return extra + min;
} else {
return span - extra + min;
}
}
template <typename T>
static __forceinline__ __device__ T compute_positions(T coord, int size,
PaddingMode padding_mode,
bool align_corners) {
coord = _unnormalize<T>(coord, size, align_corners);
if (padding_mode == PaddingMode::border) {
coord = clip_indexes(coord, size - 1);
} else if (padding_mode == PaddingMode::reflect) {
if (align_corners) {
coord = reflect_indexes(coord, 0, 2 * (size - 1));
} else {
coord = reflect_indexes(coord, -1, 2 * size - 1);
}
coord = clip_indexes(coord, size - 1);
}
return coord;
}
template <typename T>
static __forceinline__ __device__ T _unnormalize_with_mask(T coord, int size,
bool align_corners,
T* grad_in) {
if (align_corners) {
*grad_in = static_cast<T>(size - 1) / 2;
return ((coord + 1.f) / 2) * (size - 1);
} else {
*grad_in = static_cast<T>(size) / 2;
return ((coord + 1.f) * size - 1) / 2;
}
}
template <typename T>
static __forceinline__ __device__ T clip_indexes_with_mask(T in, int clip_limit,
T* grad_in) {
if (in <= static_cast<T>(0)) {
*grad_in = static_cast<T>(0);
return static_cast<T>(0);
} else {
T max = static_cast<T>(clip_limit - 1);
if (in >= max) {
*grad_in = static_cast<T>(0);
return max;
} else {
*grad_in = static_cast<T>(1);
return in;
}
}
}
template <typename T>
static __forceinline__ __device__ T
reflect_indexes_with_mask(T in, int twice_low, int twice_high, T* grad_in) {
if (twice_low == twice_high) {
*grad_in = static_cast<T>(0);
return static_cast<T>(0);
}
int grad_in_mult_;
T min = static_cast<T>(twice_low) / 2;
T span = static_cast<T>(twice_high - twice_low) / 2;
in = in - min;
if (in < static_cast<T>(0)) {
grad_in_mult_ = -1;
in = -in;
} else {
grad_in_mult_ = 1;
}
T extra = fmod(in, span);
int flips = static_cast<int>(floor(in / span));
if (flips % 2 == 0) {
*grad_in = static_cast<T>(grad_in_mult_);
return extra + min;
} else {
*grad_in = static_cast<T>(-grad_in_mult_);
return span - extra + min;
}
}
template <typename T>
static __forceinline__ __device__ T
compute_positions_with_mask(T coord, int size, PaddingMode padding_mode,
bool align_corners, T* grad_in) {
T grad_clip, grad_refl;
coord = _unnormalize_with_mask<T>(coord, size, align_corners, grad_in);
if (padding_mode == PaddingMode::border) {
coord = clip_indexes_with_mask(coord, size, &grad_clip);
*grad_in = (*grad_in) * grad_clip;
} else if (padding_mode == PaddingMode::reflect) {
if (align_corners) {
coord = reflect_indexes_with_mask(coord, 0, 2 * (size - 1), &grad_refl);
} else {
coord = reflect_indexes_with_mask(coord, -1, 2 * size - 1, &grad_refl);
}
coord = clip_indexes_with_mask(coord, size, &grad_clip);
*grad_in = (*grad_in) * grad_refl * grad_clip;
}
return coord;
}
template <typename T>
__global__ void grid_sample_cuda_kernel(const int nthreads, int n, int out_c,
int out_h, int out_w, int in_h,
int in_w, const T* input, const T* grid,
T* output, const Mode mode,
const PaddingMode padding_mode,
bool align_corners) {
int inp_sN = out_c * in_h * in_w;
int inp_sC = in_h * in_w;
int inp_sH = in_w;
int inp_sW = 1;
int grid_sN = out_h * out_w * 2;
int grid_sH = out_w * 2;
int grid_sW = 2;
int grid_sCoor = 1;
int out_sN = out_c * out_h * out_w;
int out_sC = out_h * out_w;
int out_sH = out_w;
int out_sW = 1;
CUDA_KERNEL_LOOP(index, nthreads) {
const int w = index % out_w;
const int h = (index / out_w) % out_h;
const int n = index / (out_h * out_w);
const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW;
T ix = grid[grid_offset];
T iy = grid[grid_offset + grid_sCoor];
ix = compute_positions(ix, in_w, padding_mode, align_corners);
iy = compute_positions(iy, in_h, padding_mode, align_corners);
if (mode == Mode::bilinear) {
int ix_nw = static_cast<int>(floor(ix));
int iy_nw = static_cast<int>(floor(iy));
int ix_ne = ix_nw + 1;
int iy_ne = iy_nw;
int ix_sw = ix_nw;
int iy_sw = iy_nw + 1;
int ix_se = ix_nw + 1;
int iy_se = iy_nw + 1;
T nw = (ix_se - ix) * (iy_se - iy);
T ne = (ix - ix_sw) * (iy_sw - iy);
T sw = (ix_ne - ix) * (iy - iy_ne);
T se = (ix - ix_nw) * (iy - iy_nw);
auto inp_offset_NC = n * inp_sN;
auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW;
for (int c = 0; c < out_c;
++c, inp_offset_NC += inp_sC, out_ptr_NCHW += out_sC) {
*out_ptr_NCHW = static_cast<T>(0);
if (in_bounds(iy_nw, ix_nw, in_h, in_w)) {
*out_ptr_NCHW +=
input[inp_offset_NC + iy_nw * inp_sH + ix_nw * inp_sW] * nw;
}
if (in_bounds(iy_ne, ix_ne, in_h, in_w)) {
*out_ptr_NCHW +=
input[inp_offset_NC + iy_ne * inp_sH + ix_ne * inp_sW] * ne;
}
if (in_bounds(iy_sw, ix_sw, in_h, in_w)) {
*out_ptr_NCHW +=
input[inp_offset_NC + iy_sw * inp_sH + ix_sw * inp_sW] * sw;
}
if (in_bounds(iy_se, ix_se, in_h, in_w)) {
*out_ptr_NCHW +=
input[inp_offset_NC + iy_se * inp_sH + ix_se * inp_sW] * se;
}
}
} else if (mode == Mode::nearest) {
int ix_nearest = static_cast<int>(std::nearbyint(ix));
int iy_nearest = static_cast<int>(std::nearbyint(iy));
auto inp_offset_NC = n * inp_sN;
auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW;
for (int c = 0; c < out_c;
++c, inp_offset_NC += inp_sC, out_ptr_NCHW += out_sC) {
if (in_bounds(iy_nearest, ix_nearest, in_h, in_w)) {
*out_ptr_NCHW =
input[inp_offset_NC + iy_nearest * inp_sH + ix_nearest * inp_sW];
} else {
*out_ptr_NCHW = static_cast<T>(0);
}
}
}
}
}
template <typename T>
class GridSampleOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx = ctx.cuda_device_context();
auto align_corners = ctx.Attr<bool>("align_corners");
auto padding_mode_s = ctx.Attr<std::string>("padding_mode");
auto mode_s = ctx.Attr<std::string>("mode");
PaddingMode padding_mode;
Mode mode;
if (padding_mode_s == "border") {
padding_mode = PaddingMode::border;
} else if (padding_mode_s == "reflection") {
padding_mode = PaddingMode::reflect;
} else {
padding_mode = PaddingMode::zeros;
}
if (mode_s == "nearest") {
mode = Mode::nearest;
} else {
mode = Mode::bilinear;
}
auto* input = ctx.Input<Tensor>("X");
auto* grid = ctx.Input<Tensor>("Grid");
const int n = grid->dims()[0];
const int out_h = grid->dims()[1];
const int out_w = grid->dims()[2];
const int c = input->dims()[1];
const int in_h = input->dims()[2];
const int in_w = input->dims()[3];
VLOG(3) << "n: " << n << "; c: " << c << "; out_h: " << out_h
<< "; out_w: " << out_w;
auto* output = ctx.Output<Tensor>("Output");
auto* output_data = output->mutable_data<T>(ctx.GetPlace());
VLOG(3) << "out dims: " << output->dims()[0] << "; " << output->dims()[1]
<< "; " << output->dims()[2] << "; " << output->dims()[3];
int count = static_cast<int>(n * out_h * out_w);
auto cu_stream = dev_ctx.stream();
platform::GpuLaunchConfig config =
platform::GetGpuLaunchConfig1D(dev_ctx, count);
grid_sample_cuda_kernel<
T><<<config.block_per_grid, config.thread_per_block, 0, cu_stream>>>(
count, n, c, out_h, out_w, in_h, in_w, input->data<T>(),
grid->data<T>(), output_data, mode, padding_mode, align_corners);
}
};
template <typename T>
__global__ void grid_sampler_cuda_backward_kernel(
const int nthreads, const T* grad_output, const T* input, const T* grid,
int n, int out_c, int out_h, int out_w, int in_h, int in_w, T* grad_input,
T* grad_grid, const Mode mode, const PaddingMode padding_mode,
bool align_corners) {
int inp_sN = out_c * in_h * in_w;
int inp_sC = in_h * in_w;
int inp_sH = in_w;
int inp_sW = 1;
int grid_sN = out_h * out_w * 2;
int grid_sH = out_w * 2;
int grid_sW = 2;
int grid_sCoor = 1;
int gOut_sN = out_c * out_h * out_w;
int gOut_sC = out_h * out_w;
int gOut_sH = out_w;
int gOut_sW = 1;
CUDA_KERNEL_LOOP(index, nthreads) {
const int w = index % out_w;
const int h = (index / out_w) % out_h;
const int n = index / (out_h * out_w);
const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW;
T ix = grid[grid_offset];
T iy = grid[grid_offset + grid_sCoor];
T gix_mult, giy_mult;
ix = compute_positions_with_mask(ix, in_w, padding_mode, align_corners,
&gix_mult);
iy = compute_positions_with_mask(iy, in_h, padding_mode, align_corners,
&giy_mult);
if (mode == Mode::bilinear) {
int ix_nw = static_cast<int>(floor(ix));
int iy_nw = static_cast<int>(floor(iy));
int ix_ne = ix_nw + 1;
int iy_ne = iy_nw;
int ix_sw = ix_nw;
int iy_sw = iy_nw + 1;
int ix_se = ix_nw + 1;
int iy_se = iy_nw + 1;
T nw = (ix_se - ix) * (iy_se - iy);
T ne = (ix - ix_sw) * (iy_sw - iy);
T sw = (ix_ne - ix) * (iy - iy_ne);
T se = (ix - ix_nw) * (iy - iy_nw);
T gix = static_cast<T>(0), giy = static_cast<T>(0);
int gOut_offset = n * gOut_sN + h * gOut_sH + w * gOut_sW;
T* gInp_ptr_NC = grad_input + n * inp_sN;
int inp_offset_NC = n * inp_sN;
for (int c = 0; c < out_c; ++c, inp_offset_NC += inp_sC,
gInp_ptr_NC += inp_sC, gOut_offset += gOut_sC) {
T gOut = grad_output[gOut_offset];
atomic_add(gInp_ptr_NC, iy_nw, ix_nw, inp_sH, inp_sW, in_h, in_w,
nw * gOut);
atomic_add(gInp_ptr_NC, iy_ne, ix_ne, inp_sH, inp_sW, in_h, in_w,
ne * gOut);
atomic_add(gInp_ptr_NC, iy_sw, ix_sw, inp_sH, inp_sW, in_h, in_w,
sw * gOut);
atomic_add(gInp_ptr_NC, iy_se, ix_se, inp_sH, inp_sW, in_h, in_w,
se * gOut);
if (in_bounds(iy_nw, ix_nw, in_h, in_w)) {
T nw_val = input[inp_offset_NC + iy_nw * inp_sH + ix_nw * inp_sW];
gix -= nw_val * (iy_se - iy) * gOut;
giy -= nw_val * (ix_se - ix) * gOut;
}
if (in_bounds(iy_ne, ix_ne, in_h, in_w)) {
T ne_val = input[inp_offset_NC + iy_ne * inp_sH + ix_ne * inp_sW];
gix += ne_val * (iy_sw - iy) * gOut;
giy -= ne_val * (ix - ix_sw) * gOut;
}
if (in_bounds(iy_sw, ix_sw, in_h, in_w)) {
T sw_val = input[inp_offset_NC + iy_sw * inp_sH + ix_sw * inp_sW];
gix -= sw_val * (iy - iy_ne) * gOut;
giy += sw_val * (ix_ne - ix) * gOut;
}
if (in_bounds(iy_se, ix_se, in_h, in_w)) {
T se_val = input[inp_offset_NC + iy_se * inp_sH + ix_se * inp_sW];
gix += se_val * (iy - iy_nw) * gOut;
giy += se_val * (ix - ix_nw) * gOut;
}
}
if (grad_grid != nullptr) {
T* gGrid_ptr_NHW = grad_grid + index * grid_sW;
gGrid_ptr_NHW[0] = gix_mult * gix;
gGrid_ptr_NHW[1] = giy_mult * giy;
}
} else if (mode == Mode::nearest) {
int ix_nearest = static_cast<int>(std::nearbyint(ix));
int iy_nearest = static_cast<int>(std::nearbyint(iy));
int gOut_offset = n * gOut_sN + h * gOut_sH + w * gOut_sW;
T* gInp_ptr_NC = grad_input + n * inp_sN;
for (int c = 0; c < out_c;
++c, gInp_ptr_NC += inp_sC, gOut_offset += gOut_sC) {
atomic_add(gInp_ptr_NC, iy_nearest, ix_nearest, inp_sH, inp_sW, in_h,
in_w, grad_output[gOut_offset]);
}
if (grad_grid != nullptr) {
T* gGrid_ptr_NHW = grad_grid + index * grid_sW;
gGrid_ptr_NHW[0] = static_cast<T>(0);
gGrid_ptr_NHW[1] = static_cast<T>(0);
}
}
}
}
template <typename T>
class GridSampleGradOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx = ctx.cuda_device_context();
auto align_corners = ctx.Attr<bool>("align_corners");
auto padding_mode_s = ctx.Attr<std::string>("padding_mode");
auto mode_s = ctx.Attr<std::string>("mode");
PaddingMode padding_mode;
Mode mode;
if (padding_mode_s == "border") {
padding_mode = PaddingMode::border;
} else if (padding_mode_s == "reflection") {
padding_mode = PaddingMode::reflect;
} else {
padding_mode = PaddingMode::zeros;
}
if (mode_s == "nearest") {
mode = Mode::nearest;
} else {
mode = Mode::bilinear;
}
auto* input = ctx.Input<Tensor>("X");
auto* grid = ctx.Input<Tensor>("Grid");
auto* output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
const int n = grid->dims()[0];
const int out_h = grid->dims()[1];
const int out_w = grid->dims()[2];
const int c = input->dims()[1];
const int in_h = input->dims()[2];
const int in_w = input->dims()[3];
auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
input_grad->mutable_data<T>(ctx.GetPlace());
phi::funcs::SetConstant<paddle::platform::CUDADeviceContext, T>()(
ctx.template device_context<paddle::platform::CUDADeviceContext>(),
input_grad, static_cast<T>(0));
T* grid_grad_data = nullptr;
if (ctx.HasOutput(framework::GradVarName("Grid"))) {
auto* grid_grad = ctx.Output<Tensor>(framework::GradVarName("Grid"));
grid_grad_data = grid_grad->mutable_data<T>(ctx.GetPlace());
}
int count = static_cast<int>(n * out_h * out_w);
auto cu_stream = dev_ctx.stream();
platform::GpuLaunchConfig config =
platform::GetGpuLaunchConfig1D(dev_ctx, count);
grid_sampler_cuda_backward_kernel<
T><<<config.block_per_grid, config.thread_per_block, 0, cu_stream>>>(
count, output_grad->data<T>(), input->data<T>(), grid->data<T>(), n, c,
out_h, out_w, in_h, in_w, input_grad->data<T>(), grid_grad_data, mode,
padding_mode, align_corners);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(grid_sampler, ops::GridSampleOpCUDAKernel<float>,
ops::GridSampleOpCUDAKernel<double>);
REGISTER_OP_CUDA_KERNEL(grid_sampler_grad,
ops::GridSampleGradOpCUDAKernel<float>,
ops::GridSampleGradOpCUDAKernel<double>);
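
The CUDA kernels above hinge on compute_positions: a normalized coordinate in [-1, 1] is unnormalized to pixel space and then folded back into range by border clipping or reflection. A host-side sketch of that mapping (same math as _unnormalize / clip_indexes / reflect_indexes above, illustrative only and not part of the commit):

#include <algorithm>
#include <cmath>
#include <cstdio>

enum class Padding { zeros, border, reflect };

double Unnormalize(double coord, int size, bool align_corners) {
  return align_corners ? ((coord + 1.0) / 2.0) * (size - 1)
                       : ((coord + 1.0) * size - 1.0) / 2.0;
}

double Clip(double in, int max_value) {
  return std::min(static_cast<double>(max_value), std::max(in, 0.0));
}

double Reflect(double in, int twice_low, int twice_high) {
  if (twice_low == twice_high) return 0.0;
  double low = twice_low / 2.0, span = (twice_high - twice_low) / 2.0;
  in = std::fabs(in - low);
  double extra = std::fmod(in, span);
  int flips = static_cast<int>(std::floor(in / span));
  return flips % 2 == 0 ? extra + low : span - extra + low;
}

double ComputePosition(double coord, int size, Padding mode, bool align_corners) {
  coord = Unnormalize(coord, size, align_corners);
  if (mode == Padding::border) {
    coord = Clip(coord, size - 1);
  } else if (mode == Padding::reflect) {
    coord = align_corners ? Reflect(coord, 0, 2 * (size - 1))
                          : Reflect(coord, -1, 2 * size - 1);
    coord = Clip(coord, size - 1);
  }
  return coord;
}

int main() {
  // A coordinate slightly outside the grid reflects back inside: prints 6.300.
  std::printf("%.3f\n", ComputePosition(1.2, 8, Padding::reflect, true));
}
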
This diff is collapsed.
......@@ -13,8 +13,13 @@
// limitations under the License.
#include "paddle/fluid/operators/index_select_op.h"
#include <memory>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/infermeta/binary.h"
namespace paddle {
namespace operators {
......@@ -24,52 +29,6 @@ class IndexSelectOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
platform::errors::InvalidArgument(
"Input(X) of IndexSelectOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasInput("Index"), true,
platform::errors::InvalidArgument(
"Input(Index) of IndexSelectOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
platform::errors::InvalidArgument(
"Output(Out) of IndexSelectOp should not be null."));
auto input_dim = ctx->GetInputDim("X");
auto index_dim = ctx->GetInputDim("Index");
auto dim = ctx->Attrs().Get<int>("dim");
PADDLE_ENFORCE_EQ(
dim < input_dim.size() && dim >= (0 - input_dim.size()), true,
platform::errors::OutOfRange(
"Attr(dim) is out of range, It's expected "
"to be in range of [-%d, %d]. But received Attr(dim) = %d.",
input_dim.size(), input_dim.size() - 1, dim));
PADDLE_ENFORCE_EQ(
index_dim.size() == 1 || (index_dim.size() == 2 && index_dim[1] == 1),
true, platform::errors::InvalidArgument(
"The 'shape' of Input(Index) must be 1-D tensor. "
"But received: the 'shape' of Input(Index) is [%s], "
"the dimension of Input(Index) is [%d].",
index_dim, index_dim.size()));
PADDLE_ENFORCE_EQ(index_dim[0] != 0, true,
platform::errors::InvalidArgument(
"The length of Input(Index) can't be 0."));
auto output_dim = phi::vectorize(input_dim);
if (dim < 0) {
dim += input_dim.size();
}
output_dim[dim] = index_dim[0];
ctx->SetOutputDim("Out", phi::make_ddim(output_dim));
auto type = ctx->GetInputsVarType("X")[0];
if (type == framework::proto::VarType::LOD_TENSOR) {
ctx->ShareLoD("X", /*->*/ "Out");
}
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
......@@ -148,20 +107,11 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(IndexSelectGradNoNeedBufferVarsInferer,
} // namespace paddle
namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(index_select, IndexSelectInferShapeFunctor,
PD_INFER_META(phi::IndexSelectInferMeta));
REGISTER_OPERATOR(index_select, ops::IndexSelectOp, ops::IndexSelectOpMaker,
ops::IndexSelectGradMaker<paddle::framework::OpDesc>,
ops::IndexSelectGradMaker<paddle::imperative::OpBase>);
ops::IndexSelectGradMaker<paddle::imperative::OpBase>,
IndexSelectInferShapeFunctor);
REGISTER_OPERATOR(index_select_grad, ops::IndexSelectGradOp,
ops::IndexSelectGradNoNeedBufferVarsInferer);
REGISTER_OP_CPU_KERNEL(
index_select,
ops::IndexSelectKernel<paddle::platform::CPUDeviceContext, float>,
ops::IndexSelectKernel<paddle::platform::CPUDeviceContext, double>,
ops::IndexSelectKernel<paddle::platform::CPUDeviceContext, int>,
ops::IndexSelectKernel<paddle::platform::CPUDeviceContext, int64_t>);
REGISTER_OP_CPU_KERNEL(
index_select_grad,
ops::IndexSelectGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::IndexSelectGradKernel<paddle::platform::CPUDeviceContext, double>,
ops::IndexSelectGradKernel<paddle::platform::CPUDeviceContext, int>,
ops::IndexSelectGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/index_select_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
namespace paddle {
namespace operators {
using platform::PADDLE_CUDA_NUM_THREADS;
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
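// index_select forward: one thread per output element. The flat output index
// decomposes into (pre_idx, dim_idx, offset) using `stride` and `size`;
// `index[dim_idx]` picks the source slice along the selected dim, and `delta`
// (input length minus output length along that dim) shifts the flat index
// back into the input tensor.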
template <typename T, typename IndexT>
__global__ void index_select_cuda_kernel(const T* input, T* output,
const IndexT* index, int64_t N,
int64_t stride, int64_t size,
int64_t delta) {
int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= N) {
return;
}
int64_t pre_idx = idx / (stride * size);
int64_t dim_idx = idx % (stride * size) / stride;
IndexT src_dim_idx = index[dim_idx];
int64_t input_idx = idx + (delta * pre_idx + src_dim_idx - dim_idx) * stride;
output[idx] = input[input_idx];
}
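// index_select backward: reuses the same index mapping but scatters gradients
// with atomicAdd, since repeated indices make several output slices map to
// one input slice.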
template <typename T, typename IndexT>
__global__ void index_select_grad_cuda_kernel(const T* output_grad,
T* input_grad,
const IndexT* index, int64_t nums,
int64_t N, int64_t stride,
int64_t size, int64_t delta) {
int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= N) {
return;
}
int64_t pre_idx = idx / (stride * size);
int64_t dim_idx = idx % (stride * size) / stride;
IndexT src_dim_idx = index[dim_idx];
int64_t input_idx = idx + (delta * pre_idx + src_dim_idx - dim_idx) * stride;
paddle::platform::CudaAtomicAdd(&input_grad[input_idx], output_grad[idx]);
}
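// Zero-fills input_grad so the backward kernel above can accumulate into it.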
template <typename T>
__global__ void index_select_grad_init(T* input_grad, int64_t N) {
int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= N) {
return;
}
input_grad[idx] = 0.0;
}
template <typename DeviceContext, typename T>
class IndexSelectCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* in = context.Input<LoDTensor>("X");
auto* index = context.Input<LoDTensor>("Index");
auto* out = context.Output<LoDTensor>("Out");
int dim = context.Attr<int>("dim");
auto input_dim = in->dims();
auto output_dim = out->dims();
dim = dim >= 0 ? dim : dim + input_dim.size();
auto stride_dim = phi::stride(input_dim);
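    // stride: elements skipped when moving one step along `dim`;
    // size: output length along `dim`; delta: input minus output length.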
int64_t stride = stride_dim[dim];
int64_t size = output_dim[dim];
int64_t delta = input_dim[dim] - size;
const auto& index_type = framework::TransToProtoVarType(index->dtype());
bool index_type_match = index_type == framework::proto::VarType::INT64 ||
index_type == framework::proto::VarType::INT32;
PADDLE_ENFORCE_EQ(index_type_match, true,
platform::errors::InvalidArgument(
"Input(Index) holds the wrong type, it holds %s, but "
"desires to be %s or %s",
paddle::framework::DataTypeToString(index_type),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT32),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT64)));
auto* in_data = in->data<T>();
auto* out_data = out->mutable_data<T>(context.GetPlace());
int64_t numel = out->numel();
auto stream =
context.template device_context<platform::CUDADeviceContext>().stream();
if (index_type == framework::proto::VarType::INT64) {
const int64_t* index_data = index->data<int64_t>();
index_select_cuda_kernel<T, int64_t><<<
(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS,
PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_data, out_data, index_data,
numel, stride, size, delta);
platform::GpuStreamSync(stream);
} else {
const int* index_data = index->data<int>();
index_select_cuda_kernel<T, int><<<(numel + PADDLE_CUDA_NUM_THREADS - 1) /
PADDLE_CUDA_NUM_THREADS,
PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
in_data, out_data, index_data, numel, stride, size, delta);
platform::GpuStreamSync(stream);
}
}
};
template <typename DeviceContext, typename T>
class IndexSelectGradCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* output_grad = context.Input<LoDTensor>(framework::GradVarName("Out"));
auto* in_grad = context.Output<LoDTensor>(framework::GradVarName("X"));
auto* index = context.Input<LoDTensor>("Index");
auto* output_grad_data = output_grad->data<T>();
auto* in_grad_data = in_grad->mutable_data<T>(context.GetPlace());
int dim = context.Attr<int>("dim");
auto input_dim = in_grad->dims();
auto output_dim = output_grad->dims();
dim = dim >= 0 ? dim : dim + input_dim.size();
auto stride_dim = phi::stride(input_dim);
int64_t stride = stride_dim[dim];
int64_t size = output_dim[dim];
int64_t delta = input_dim[dim] - size;
const auto& index_type = framework::TransToProtoVarType(index->dtype());
bool index_type_match = index_type == framework::proto::VarType::INT64 ||
index_type == framework::proto::VarType::INT32;
PADDLE_ENFORCE_EQ(index_type_match, true,
platform::errors::InvalidArgument(
"Input(Index) holds the wrong type, it holds %s, but "
"desires to be %s or %s",
paddle::framework::DataTypeToString(index_type),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT32),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT64)));
int64_t numel = in_grad->numel();
int64_t index_nums = index->numel();
int64_t out_nums = output_grad->numel();
auto stream =
context.template device_context<platform::CUDADeviceContext>().stream();
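    // in_grad must start from zero because the backward kernel accumulates
    // with atomicAdd; the launches below dispatch on the index dtype.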
index_select_grad_init<
T><<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS,
PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_grad_data, numel);
if (index_type == framework::proto::VarType::INT64) {
const int64_t* index_data = index->data<int64_t>();
index_select_grad_cuda_kernel<T, int64_t><<<
(out_nums + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS,
PADDLE_CUDA_NUM_THREADS, 0, stream>>>(output_grad_data, in_grad_data,
index_data, index_nums,
out_nums, stride, size, delta);
platform::GpuStreamSync(stream);
} else {
const int* index_data = index->data<int>();
index_select_grad_cuda_kernel<T, int><<<
(out_nums + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS,
PADDLE_CUDA_NUM_THREADS, 0, stream>>>(output_grad_data, in_grad_data,
index_data, index_nums,
out_nums, stride, size, delta);
platform::GpuStreamSync(stream);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
index_select,
ops::IndexSelectCUDAKernel<paddle::platform::CUDADeviceContext, float>,
ops::IndexSelectCUDAKernel<paddle::platform::CUDADeviceContext, double>,
ops::IndexSelectCUDAKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>,
ops::IndexSelectCUDAKernel<paddle::platform::CUDADeviceContext, int>,
ops::IndexSelectCUDAKernel<paddle::platform::CUDADeviceContext, int64_t>);
REGISTER_OP_CUDA_KERNEL(
index_select_grad,
ops::IndexSelectGradCUDAKernel<paddle::platform::CUDADeviceContext, float>,
ops::IndexSelectGradCUDAKernel<paddle::platform::CUDADeviceContext, double>,
ops::IndexSelectGradCUDAKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>,
ops::IndexSelectGradCUDAKernel<paddle::platform::CUDADeviceContext, int>,
ops::IndexSelectGradCUDAKernel<paddle::platform::CUDADeviceContext,
int64_t>);
......@@ -91,41 +91,6 @@ void IndexSelectInner(const framework::ExecutionContext& context,
output->Resize(output_dim);
}
template <typename DeviceContext, typename T>
class IndexSelectKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto inputs = *context.Input<framework::LoDTensor>("X");
auto* index = context.Input<framework::LoDTensor>("Index");
auto* output = context.Output<framework::LoDTensor>("Out");
int dim = context.Attr<int>("dim");
if (dim < 0) {
dim += inputs.dims().size();
}
const auto& index_type = framework::TransToProtoVarType(index->dtype());
bool index_type_match = index_type == framework::proto::VarType::INT32 ||
index_type == framework::proto::VarType::INT64;
PADDLE_ENFORCE_EQ(index_type_match, true,
platform::errors::InvalidArgument(
"Input(Index) holds the wrong type, it holds %s, but "
"desires to be %s or %s",
paddle::framework::DataTypeToString(index_type),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT32),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT64)));
if (index_type == framework::proto::VarType::INT32) {
IndexSelectInner<DeviceContext, T, int>(context, &inputs, *index, output,
dim);
} else if (index_type == framework::proto::VarType::INT64) {
IndexSelectInner<DeviceContext, T, int64_t>(context, &inputs, *index,
output, dim);
}
}
};
template <typename DeviceContext, typename T, class Enable = void>
struct IndexSelectAdd {
void operator()(const framework::ExecutionContext& ctx, int slice_size,
......@@ -197,43 +162,5 @@ void IndexSelectGradInner(const framework::ExecutionContext& context,
x_grad->Resize(output_dim);
}
template <typename DeviceContext, typename T>
class IndexSelectGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* x_grad =
context.Output<framework::LoDTensor>(framework::GradVarName("X"));
auto* index = context.Input<framework::LoDTensor>("Index");
auto* out_grad =
context.Input<framework::LoDTensor>(framework::GradVarName("Out"));
int dim = context.Attr<int>("dim");
if (dim < 0) {
dim += out_grad->dims().size();
}
const auto& index_type = framework::TransToProtoVarType(index->dtype());
bool index_type_match = index_type == framework::proto::VarType::INT32 ||
index_type == framework::proto::VarType::INT64;
PADDLE_ENFORCE_EQ(index_type_match, true,
platform::errors::InvalidArgument(
"Input(Index) holds the wrong type, it holds %s, but "
"desires to be %s or %s",
paddle::framework::DataTypeToString(index_type),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT32),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT64)));
if (index_type == framework::proto::VarType::INT32) {
IndexSelectGradInner<DeviceContext, T, int>(context, *out_grad, *index,
x_grad, dim);
} else if (index_type == framework::proto::VarType::INT64) {
IndexSelectGradInner<DeviceContext, T, int64_t>(context, *out_grad,
*index, x_grad, dim);
}
}
};
} // namespace operators
} // namespace paddle
......@@ -12,12 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/index_select_op.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename DeviceContext, typename T>
class IndexSelectNPUKernel : public framework::OpKernel<T> {
public:
......
......@@ -14,10 +14,13 @@
#include <cmath>
#include <string>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/phi/infermeta/binary.h"
namespace paddle {
namespace operators {
......@@ -60,40 +63,6 @@ class IscloseOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "Isclose");
OP_INOUT_CHECK(ctx->HasInput("Other"), "Input", "Other", "Isclose");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Isclose");
auto input_dim = ctx->GetInputDim("Input");
auto other_dim = ctx->GetInputDim("Other");
PADDLE_ENFORCE_EQ(input_dim.size(), other_dim.size(),
platform::errors::PreconditionNotMet(
"Input(Input) and Input(Other) must have the same "
"dimension size."));
int n = input_dim.size();
bool is_runtime = ctx->IsRuntime();
for (int i = 0; i < n; i++) {
if (is_runtime) {
PADDLE_ENFORCE_EQ(input_dim[i], other_dim[i],
platform::errors::PreconditionNotMet(
"The value at dim %d of Input(Input) is not "
"equal to the Input(Other): %ld != %ld.",
i, input_dim[i], other_dim[i]));
} else {
if (!(input_dim[i] < 0 || other_dim[i] < 0)) {
PADDLE_ENFORCE_EQ(input_dim[i], other_dim[i],
platform::errors::PreconditionNotMet(
"The value at dim %d of Input(Input) is not "
"equal to the Input(Other): %ld != %ld.",
i, input_dim[i], other_dim[i]));
}
}
}
ctx->SetOutputDim("Out", input_dim);
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
......@@ -115,8 +84,10 @@ class IscloseOpVarTypeInference : public framework::VarTypeInference {
namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(isclose, IscloseInferShapeFunctor,
PD_INFER_META(phi::ValueCompareInferMeta));
REGISTER_OPERATOR(
isclose, ops::IscloseOp, ops::IscloseOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
ops::IscloseOpVarTypeInference);
ops::IscloseOpVarTypeInference, IscloseInferShapeFunctor);
......@@ -11,7 +11,9 @@
#include <memory>
#include <string>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/infermeta/binary.h"
namespace paddle {
namespace operators {
......@@ -21,44 +23,6 @@ using framework::Tensor;
class KLDivLossOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "KLDivLoss");
OP_INOUT_CHECK(ctx->HasInput("Target"), "Input", "Target", "KLDivLoss");
OP_INOUT_CHECK(ctx->HasOutput("Loss"), "Output", "Loss", "KLDivLoss");
auto dim_x = ctx->GetInputDim("X");
auto dim_target = ctx->GetInputDim("Target");
PADDLE_ENFORCE_EQ(dim_x.size(), dim_target.size(),
platform::errors::InvalidArgument(
"Input(X) rank and Input(Target) rank should be "
"same, but received X rank(%d) != Target rank(%d)",
dim_x.size(), dim_target.size()));
for (int i = 0; i < dim_x.size(); i++) {
if (ctx->IsRuntime() || (dim_x[i] > 0 && dim_target[i] > 0)) {
PADDLE_ENFORCE_EQ(
dim_x[i], dim_target[i],
platform::errors::InvalidArgument(
"Input(X) and Input(Target) should in same shape. but received "
"X dimension[%d](%d) != Target dimension[%d](%d)",
i, dim_x[i], i, dim_target[i]));
}
}
auto reduction = ctx->Attrs().Get<std::string>("reduction");
auto reduction_valid = "mean" == reduction || "sum" == reduction ||
"batchmean" == reduction || "none" == reduction;
PADDLE_ENFORCE_EQ(
reduction_valid, true,
platform::errors::InvalidArgument(
"Attr(reduction) can only be 'none'|'batchmean'|'sum'|'mean'."));
if ("none" == reduction) {
ctx->SetOutputDim("Loss", dim_x);
} else {
ctx->SetOutputDim("Loss", {1});
}
}
protected:
framework::OpKernelType GetExpectedKernelType(
......@@ -171,8 +135,12 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(KLDivLossGradNoNeedBufferVarInferer, "X");
} // namespace paddle
namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(kldiv_loss, KLDivInferShapeFunctor,
PD_INFER_META(phi::KLDivInferMeta));
REGISTER_OPERATOR(kldiv_loss, ops::KLDivLossOp, ops::KLDivLossOpMaker,
ops::KLDivLossOpGradMaker<paddle::framework::OpDesc>,
ops::KLDivLossOpGradMaker<paddle::imperative::OpBase>);
ops::KLDivLossOpGradMaker<paddle::imperative::OpBase>,
KLDivInferShapeFunctor);
REGISTER_OPERATOR(kldiv_loss_grad, ops::KLDivLossOpGrad,
ops::KLDivLossGradNoNeedBufferVarInferer);
......@@ -18,8 +18,8 @@ limitations under the License. */
#include "paddle/fluid/framework/phi_utils.h"
#include "paddle/fluid/operators/set_value_op.h"
#include "paddle/fluid/operators/svd_helper.h"
#include "paddle/fluid/operators/tril_triu_op.h"
#include "paddle/phi/kernels/funcs/lapack/lapack_function.h"
#include "paddle/phi/kernels/funcs/tril_triu_compute.h"
#include "paddle/phi/kernels/math_kernel.h"
#include "paddle/phi/kernels/triangular_solve_kernel.h"
......@@ -404,11 +404,12 @@ void LU_Unpack(const DeviceContext& dev_ctx, const framework::Tensor* LU,
const auto W = udims[udims.size() - 1];
auto L_dataptr = L->mutable_data<T>(dev_ctx.GetPlace());
platform::ForRange<DeviceContext> x_for_range(dev_ctx, LU->numel());
TrilTriuCompute<T> tril_computer(LU->data<T>(), -1, true, H, W, L_dataptr);
phi::funcs::TrilTriuCompute<T> tril_computer(LU->data<T>(), -1, true, H, W,
L_dataptr);
x_for_range(tril_computer);
TrilTriuCompute<T> triu_computer(LU->data<T>(), 0, false, H, W,
U->mutable_data<T>(dev_ctx.GetPlace()));
phi::funcs::TrilTriuCompute<T> triu_computer(
LU->data<T>(), 0, false, H, W, U->mutable_data<T>(dev_ctx.GetPlace()));
x_for_range(triu_computer);
// set L's diagonal 1
......@@ -532,14 +533,14 @@ class LUGradKernel : public framework::OpKernel<T> {
auto phil_rank = LmHdims.size();
auto phiu_rank = UmHdims.size();
platform::ForRange<DeviceContext> l_for_range(dev_ctx, phi_L.numel());
TrilTriuCompute<T> tril_computer(phi_L.data<T>(), -1, true,
LmHdims[phil_rank - 2],
phi::funcs::TrilTriuCompute<T> tril_computer(
phi_L.data<T>(), -1, true, LmHdims[phil_rank - 2],
LmHdims[phil_rank - 1], phi_L.data<T>());
l_for_range(tril_computer);
platform::ForRange<DeviceContext> u_for_range(dev_ctx, phi_U.numel());
TrilTriuCompute<T> triu_computer(phi_U.data<T>(), 0, false,
UmHdims[phiu_rank - 2],
phi::funcs::TrilTriuCompute<T> triu_computer(
phi_U.data<T>(), 0, false, UmHdims[phiu_rank - 2],
UmHdims[phiu_rank - 1], phi_U.data<T>());
u_for_range(triu_computer);
......@@ -591,8 +592,9 @@ class LUGradKernel : public framework::OpKernel<T> {
const auto W = phidims[phidims.size() - 1];
platform::ForRange<DeviceContext> x_for_range(dev_ctx,
phi_complement.numel());
TrilTriuCompute<T> tril_computer(phi_complement.data<T>(), -1, true, H,
W, phi_complement_l.data<T>());
phi::funcs::TrilTriuCompute<T> tril_computer(
phi_complement.data<T>(), -1, true, H, W,
phi_complement_l.data<T>());
x_for_range(tril_computer);
Tensor_Sub<DeviceContext, T>(dev_ctx, phi, phi_complement_l, &phi);
......@@ -664,8 +666,8 @@ class LUGradKernel : public framework::OpKernel<T> {
const auto W = phidims[phidims.size() - 1];
platform::ForRange<DeviceContext> x_for_range(dev_ctx,
phi_complement.numel());
TrilTriuCompute<T> triu_computer(phi_complement.data<T>(), 0, false, H, W,
phi_complement_u.data<T>());
phi::funcs::TrilTriuCompute<T> triu_computer(
phi_complement.data<T>(), 0, false, H, W, phi_complement_u.data<T>());
x_for_range(triu_computer);
Tensor_Sub<DeviceContext, T>(dev_ctx, phi, phi_complement_u, &phi);
......
......@@ -16,7 +16,8 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/lu_op.h"
#include "paddle/fluid/operators/tril_triu_op.h"
#include "paddle/fluid/platform/for_range.h"
#include "paddle/phi/kernels/funcs/tril_triu_compute.h"
namespace paddle {
namespace operators {
......@@ -87,7 +88,8 @@ class LU_UnpackGradKernel : public framework::OpKernel<T> {
auto W = ldims[ldims.size() - 1];
auto L_dataptr = dl_tril.mutable_data<T>(dev_ctx.GetPlace());
platform::ForRange<DeviceContext> l_for_range(dev_ctx, dl->numel());
TrilTriuCompute<T> tril_computer(dl->data<T>(), -1, true, H, W, L_dataptr);
phi::funcs::TrilTriuCompute<T> tril_computer(dl->data<T>(), -1, true, H, W,
L_dataptr);
l_for_range(tril_computer);
const auto udims = du->dims();
......@@ -96,7 +98,8 @@ class LU_UnpackGradKernel : public framework::OpKernel<T> {
W = udims[udims.size() - 1];
auto U_dataptr = du_triu.mutable_data<T>(dev_ctx.GetPlace());
platform::ForRange<DeviceContext> u_for_range(dev_ctx, du->numel());
TrilTriuCompute<T> triu_computer(du->data<T>(), 0, false, H, W, U_dataptr);
phi::funcs::TrilTriuCompute<T> triu_computer(du->data<T>(), 0, false, H, W,
U_dataptr);
u_for_range(triu_computer);
auto xdims = dx->dims();
......
......@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/multiplex_op.h"
#include <memory>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
......@@ -169,15 +169,3 @@ REGISTER_OPERATOR(multiplex, ops::MultiplexOp, ops::MultiplexOpMaker,
ops::MultiplexGradMaker<paddle::framework::OpDesc>,
ops::MultiplexGradMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(multiplex_grad, ops::MultiplexGradOp);
REGISTER_OP_CPU_KERNEL(
multiplex,
ops::MultiplexCPUKernel<paddle::platform::CPUDeviceContext, float>,
ops::MultiplexCPUKernel<paddle::platform::CPUDeviceContext, double>,
ops::MultiplexCPUKernel<paddle::platform::CPUDeviceContext, int>,
ops::MultiplexCPUKernel<paddle::platform::CPUDeviceContext, int64_t>);
REGISTER_OP_CPU_KERNEL(
multiplex_grad,
ops::MultiplexGradCPUKernel<paddle::platform::CPUDeviceContext, float>,
ops::MultiplexGradCPUKernel<paddle::platform::CPUDeviceContext, double>,
ops::MultiplexGradCPUKernel<paddle::platform::CPUDeviceContext, int>,
ops::MultiplexGradCPUKernel<paddle::platform::CPUDeviceContext, int64_t>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/multiplex_op.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename Place, typename T>
class MultiplexGPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const {
auto ins = ctx.MultiInput<Tensor>("X");
auto* ids = ctx.Input<Tensor>("Ids");
auto* out = ctx.Output<Tensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
for (size_t i = 0; i < ins.size(); ++i) {
PADDLE_ENFORCE_GT(
ins[i]->numel(), 0,
platform::errors::OutOfRange(
"indexing will be out of bounds with size 0 for the %d-th input.",
i));
}
auto rows = ins[0]->dims()[0];
auto cols = ins[0]->numel() / rows;
// copy index to cpu
Tensor index_t_cpu;
paddle::framework::TensorCopySync(*ids, platform::CPUPlace(), &index_t_cpu);
auto* index = index_t_cpu.data<int32_t>();
auto stream = ctx.cuda_device_context().stream();
platform::CUDAPlace place = ctx.GetPlace();
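    // Ids was copied to the host synchronously above, so index[i] can be read
    // here while issuing one async device-to-device row copy per output row.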
for (auto i = 0; i < rows; i++) {
int32_t k = index[i];
PADDLE_ENFORCE_GE(k, 0, platform::errors::PreconditionNotMet(
"index must be nonnegative."));
PADDLE_ENFORCE_LT(static_cast<size_t>(k), ins.size(),
platform::errors::PreconditionNotMet(
"index exceeds the number of candidate tensors."));
memory::Copy(place, out->data<T>() + i * cols, place,
ins[k]->data<T>() + i * cols, cols * sizeof(T), stream);
}
}
};
template <typename Place, typename T>
class MultiplexGradGPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const {
auto* d_out = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto* ids = ctx.Input<Tensor>("Ids");
auto d_ins = ctx.MultiOutput<Tensor>(framework::GradVarName("X"));
size_t idx = -1UL;
for (size_t i = 0; i < d_ins.size(); i++) {
if (d_ins[i]) {
d_ins[i]->mutable_data<T>(ctx.GetPlace());
auto t = framework::EigenVector<T>::Flatten(*d_ins[i]);
t.device(*ctx.template device_context<Place>().eigen_device()) =
t.constant(static_cast<T>(0));
idx = i;
}
}
if (idx == -1UL) return;
auto rows = d_ins[idx]->dims()[0];
auto cols = d_ins[idx]->numel() / rows;
// copy index to cpu
Tensor index_t_cpu;
paddle::framework::TensorCopySync(*ids, platform::CPUPlace(), &index_t_cpu);
auto* index = index_t_cpu.data<int32_t>();
auto stream = ctx.cuda_device_context().stream();
platform::CUDAPlace place = ctx.GetPlace();
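    // Each row i of d_out flows back only to d_ins[index[i]]; null entries
    // (gradients not requested) are skipped and untouched rows keep the zeros
    // written above.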
for (auto i = 0; i < rows; i++) {
size_t k = static_cast<size_t>(index[i]);
if (d_ins[k]) {
memory::Copy(place, d_ins[k]->data<T>() + i * cols, place,
d_out->data<T>() + i * cols, cols * sizeof(T), stream);
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
multiplex,
ops::MultiplexGPUKernel<paddle::platform::CUDADeviceContext, float>,
ops::MultiplexGPUKernel<paddle::platform::CUDADeviceContext, double>,
ops::MultiplexGPUKernel<paddle::platform::CUDADeviceContext, int>,
ops::MultiplexGPUKernel<paddle::platform::CUDADeviceContext, int64_t>);
REGISTER_OP_CUDA_KERNEL(
multiplex_grad,
ops::MultiplexGradGPUKernel<paddle::platform::CUDADeviceContext, float>,
ops::MultiplexGradGPUKernel<paddle::platform::CUDADeviceContext, double>,
ops::MultiplexGradGPUKernel<paddle::platform::CUDADeviceContext, int>,
ops::MultiplexGradGPUKernel<paddle::platform::CUDADeviceContext, int64_t>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/memory/memcpy.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class MultiplexCPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const {
auto ins = ctx.MultiInput<framework::Tensor>("X");
auto ids = ctx.Input<framework::Tensor>("Ids");
auto* out = ctx.Output<framework::Tensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
for (size_t i = 0; i < ins.size(); ++i) {
PADDLE_ENFORCE_GT(
ins[i]->numel(), 0,
platform::errors::OutOfRange(
"indexing will be out of bounds with size 0 for the %d-th input.",
i));
}
auto rows = ins[0]->dims()[0];
auto cols = ins[0]->numel() / rows;
auto index = ids->data<int32_t>();
platform::CPUPlace place = ctx.GetPlace();
for (auto i = 0; i < rows; i++) {
int32_t k = index[i];
PADDLE_ENFORCE_GE(k, 0, platform::errors::PreconditionNotMet(
"index must be nonnegative."));
PADDLE_ENFORCE_LT(static_cast<size_t>(k), ins.size(),
platform::errors::PreconditionNotMet(
"index exceeds the number of candidate tensors."));
memory::Copy(place, out->data<T>() + i * cols, place,
ins[k]->data<T>() + i * cols, cols * sizeof(T));
}
}
};
template <typename DeviceContext, typename T>
class MultiplexGradCPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const {
auto* d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
auto* ids = ctx.Input<framework::Tensor>("Ids");
auto d_ins =
ctx.MultiOutput<framework::Tensor>(framework::GradVarName("X"));
size_t idx = -1UL;
for (size_t i = 0; i < d_ins.size(); i++) {
if (d_ins[i]) {
d_ins[i]->mutable_data<T>(ctx.GetPlace());
auto t = framework::EigenVector<T>::Flatten(*d_ins[i]);
t.device(*ctx.template device_context<DeviceContext>().eigen_device()) =
t.constant(static_cast<T>(0));
idx = i;
}
}
if (idx == -1UL) return;
auto rows = d_ins[idx]->dims()[0];
auto cols = d_ins[idx]->numel() / rows;
auto* index = ids->data<int32_t>();
platform::CPUPlace place = ctx.GetPlace();
for (auto i = 0; i < rows; i++) {
size_t k = static_cast<size_t>(index[i]);
if (d_ins[k]) {
memory::Copy(place, d_ins[k]->data<T>() + i * cols, place,
d_out->data<T>() + i * cols, cols * sizeof(T));
}
}
}
};
} // namespace operators
} // namespace paddle
......@@ -145,8 +145,6 @@ REGISTER_OPERATOR(qr, ops::QrOp, ops::QrOpMaker,
REGISTER_OPERATOR(qr_grad, ops::QrGradOp);
REGISTER_OP_CPU_KERNEL(qr, ops::QrCPUKernel<float>, ops::QrCPUKernel<double>);
REGISTER_OP_CPU_KERNEL(
qr_grad, ops::QrGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::QrGradKernel<paddle::platform::CPUDeviceContext, double>);
......@@ -48,85 +48,6 @@ static inline std::tuple<bool, bool> _parse_qr_mode(std::string mode) {
return std::make_tuple(compute_q, reduced);
}
template <typename T>
class QrCPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
bool compute_q;
bool reduced_mode;
const Tensor& x = *context.Input<Tensor>("X");
Tensor& q = *context.Output<Tensor>("Q");
Tensor& r = *context.Output<Tensor>("R");
std::string mode = context.Attr<std::string>("mode");
std::tie(compute_q, reduced_mode) = _parse_qr_mode(mode);
auto numel = x.numel();
PADDLE_ENFORCE_GT(numel, 0, platform::errors::PreconditionNotMet(
"The input of QR is empty."));
auto x_dims = x.dims();
int x_rank = x_dims.size();
int m = x_dims[x_rank - 2];
int n = x_dims[x_rank - 1];
int min_mn = std::min(m, n);
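    // Reduced mode: Q is m x min(m, n), R is min(m, n) x n.
    // Complete mode: Q is m x m, R is m x n. `k` records which one applies.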
int k = reduced_mode ? min_mn : m;
int batch_size = numel / (m * n);
int x_stride = m * n;
int q_stride = m * k;
int r_stride = k * n;
auto* x_data = x.data<phi::dtype::Real<T>>();
T* q_data = nullptr;
if (compute_q) {
q_data = q.mutable_data<phi::dtype::Real<T>>(
context.GetPlace(),
size_t(batch_size * m * k * sizeof(phi::dtype::Real<T>)));
memset(q_data, 0,
size_t(batch_size * m * k * sizeof(phi::dtype::Real<T>)));
}
auto* r_data = r.mutable_data<phi::dtype::Real<T>>(
context.GetPlace(),
size_t(batch_size * k * n * sizeof(phi::dtype::Real<T>)));
memset(r_data, 0, size_t(batch_size * k * n * sizeof(phi::dtype::Real<T>)));
// Implement QR by calling Eigen
for (int i = 0; i < batch_size; ++i) {
const T* x_matrix_ptr = x_data + i * x_stride;
T* r_matrix_ptr = r_data + i * r_stride;
using EigenDynamicMatrix =
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
auto x_matrix = Eigen::Map<const EigenDynamicMatrix>(x_matrix_ptr, m, n);
Eigen::HouseholderQR<EigenDynamicMatrix> qr(x_matrix);
if (reduced_mode) {
auto qr_top_matrix = qr.matrixQR().block(0, 0, min_mn, n);
auto r_matrix_view =
qr_top_matrix.template triangularView<Eigen::Upper>();
auto r_matrix = EigenDynamicMatrix(r_matrix_view);
memcpy(r_matrix_ptr, r_matrix.data(), r_matrix.size() * sizeof(T));
} else {
auto r_matrix_view =
qr.matrixQR().template triangularView<Eigen::Upper>();
auto r_matrix = EigenDynamicMatrix(r_matrix_view);
memcpy(r_matrix_ptr, r_matrix.data(), r_matrix.size() * sizeof(T));
}
if (compute_q) {
T* q_matrix_ptr = q_data + i * q_stride;
if (reduced_mode) {
auto q_matrix =
qr.householderQ() * EigenDynamicMatrix::Identity(m, min_mn);
q_matrix.transposeInPlace();
memcpy(q_matrix_ptr, q_matrix.data(), q_matrix.size() * sizeof(T));
} else {
auto q_matrix =
qr.householderQ() * EigenDynamicMatrix::Identity(m, m);
q_matrix.transposeInPlace();
memcpy(q_matrix_ptr, q_matrix.data(), q_matrix.size() * sizeof(T));
}
}
}
}
};
template <typename DeviceContext, typename T>
class QrGradKernel : public framework::OpKernel<T> {
public:
......
......@@ -9,9 +9,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/roi_align_op.h"
#include <memory>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/ternary.h"
namespace paddle {
namespace operators {
......@@ -23,79 +26,6 @@ class ROIAlignOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
platform::errors::NotFound("Input(X) of ROIAlignOp "
"is not found."));
PADDLE_ENFORCE_EQ(ctx->HasInput("ROIs"), true,
platform::errors::NotFound("Input(ROIs) of ROIAlignOp "
"is not found."));
PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
platform::errors::NotFound("Output(Out) of ROIAlignOp "
"is not found."));
auto input_dims = ctx->GetInputDim("X");
auto rois_dims = ctx->GetInputDim("ROIs");
if (ctx->HasInput("RoisNum")) {
auto rois_num_dims = ctx->GetInputDim("RoisNum");
PADDLE_ENFORCE_EQ(
rois_num_dims.size(), 1,
platform::errors::InvalidArgument("The size of RoisNum should be 1"
", but received size = %d",
rois_num_dims.size()));
}
PADDLE_ENFORCE_EQ(
input_dims.size(), 4,
platform::errors::InvalidArgument(
"The format of Input(X) in"
"RoIAlignOp is NCHW. And the rank of input must be 4. "
"But received rank = %d",
input_dims.size()));
PADDLE_ENFORCE_EQ(rois_dims.size(), 2, platform::errors::InvalidArgument(
"The rank of Input(ROIs) "
"in RoIAlignOp should be 2. "
"But the rank of RoIs is %d",
rois_dims.size()));
if (ctx->IsRuntime()) {
PADDLE_ENFORCE_EQ(rois_dims[1], 4,
platform::errors::InvalidArgument(
"The second dimension "
"of Input(ROIs) should be 4. But received the "
"dimension = %d",
rois_dims[1]));
}
int pooled_height = ctx->Attrs().Get<int>("pooled_height");
int pooled_width = ctx->Attrs().Get<int>("pooled_width");
float spatial_scale = ctx->Attrs().Get<float>("spatial_scale");
PADDLE_ENFORCE_GT(pooled_height, 0,
platform::errors::InvalidArgument(
"The 'pooled_height' attribute in RoIAlignOp is "
"invalid. The height must be greater than 0. But "
"received 'pooled_height' = %d",
pooled_height));
PADDLE_ENFORCE_GT(pooled_width, 0,
platform::errors::InvalidArgument(
"The 'pooled_width' attribute in RoIAlignOp is "
"invalid. The width must be greater than 0. But "
"received 'pooled_width' = %d",
pooled_width));
PADDLE_ENFORCE_GT(spatial_scale, 0.0f,
platform::errors::InvalidArgument(
"The 'spatial_scale' attribute in RoIAlignOp is "
"invalid. The scale must be greater than 0. But "
"received 'spatial_scale' = %f",
spatial_scale));
auto out_dims = input_dims;
out_dims[0] = rois_dims[0];
out_dims[1] = input_dims[1];
out_dims[2] = pooled_height;
out_dims[3] = pooled_width;
ctx->SetOutputDim("Out", out_dims);
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
......@@ -221,17 +151,16 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(RoiAlignGradNoNeedBufVarsInferer, "X");
} // namespace paddle
namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(roi_align, RoiAlignInferShapeFunctor,
PD_INFER_META(phi::RoiAlignInferMeta));
REGISTER_OPERATOR(roi_align, ops::ROIAlignOp, ops::ROIAlignOpMaker,
ops::ROIAlignGradMaker<paddle::framework::OpDesc>,
ops::ROIAlignGradMaker<paddle::imperative::OpBase>);
ops::ROIAlignGradMaker<paddle::imperative::OpBase>,
RoiAlignInferShapeFunctor);
REGISTER_OPERATOR(roi_align_grad, ops::ROIAlignGradOp,
ops::RoiAlignGradNoNeedBufVarsInferer);
REGISTER_OP_CPU_KERNEL(
roi_align_grad,
ops::CPUROIAlignGradOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::CPUROIAlignGradOpKernel<paddle::platform::CPUDeviceContext, double>,
ops::CPUROIAlignGradOpKernel<paddle::platform::CPUDeviceContext, int>);
REGISTER_OP_VERSION(roi_align)
.AddCheckpoint(
R"ROC(
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <limits>
#include <numeric>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
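// Distributes one sample's gradient (out_grad_this_bin / count) to its four
// neighbouring pixels, weighted by the bilinear interpolation coefficients;
// samples that fall outside the feature map contribute nothing.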
template <class T>
void bilinear_interpolate_gradient(const int height, const int width, T y, T x,
const T out_grad_this_bin, const T count,
T* batch_grad_data) {
int x_low, y_low, x_high, y_high;
T w1, w2, w3, w4;
if (y < -1.0 || y > height || x < -1.0 || x > width) {
w1 = w2 = w3 = w4 = 0;
x_low = x_high = y_low = y_high = -1;
return;
}
y = y <= 0 ? 0 : y;
x = x <= 0 ? 0 : x;
y_low = static_cast<int>(y);
x_low = static_cast<int>(x);
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = static_cast<T>(y_low);
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = static_cast<T>(x_low);
} else {
x_high = x_low + 1;
}
T ly = y - y_low, lx = x - x_low;
T hy = 1. - ly, hx = 1. - lx;
w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
T diff1 = out_grad_this_bin * w1 / count;
T diff2 = out_grad_this_bin * w2 / count;
T diff3 = out_grad_this_bin * w3 / count;
T diff4 = out_grad_this_bin * w4 / count;
if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
*(batch_grad_data + y_low * width + x_low) += diff1;
*(batch_grad_data + y_low * width + x_high) += diff2;
*(batch_grad_data + y_high * width + x_low) += diff3;
*(batch_grad_data + y_high * width + x_high) += diff4;
}
}
template <typename DeviceContext, typename T>
class CPUROIAlignGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* in = ctx.Input<framework::Tensor>("X");
auto* rois = ctx.Input<framework::LoDTensor>("ROIs");
auto* out_grad =
ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
auto* in_grad = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
auto pooled_height = ctx.Attr<int>("pooled_height");
auto pooled_width = ctx.Attr<int>("pooled_width");
auto spatial_scale = ctx.Attr<float>("spatial_scale");
auto sampling_ratio = ctx.Attr<int>("sampling_ratio");
auto in_dims = in->dims();
auto aligned = ctx.Attr<bool>("aligned");
int channels = in_dims[1];
int height = in_dims[2];
int width = in_dims[3];
int rois_num = rois->dims()[0];
if (!in_grad) {
return;
}
Tensor roi_batch_id_list;
roi_batch_id_list.Resize({rois_num});
int* roi_batch_id_data =
roi_batch_id_list.mutable_data<int>(ctx.GetPlace());
int rois_batch_size;
if (ctx.HasInput("RoisNum")) {
auto* rois_num_t = ctx.Input<framework::Tensor>("RoisNum");
rois_batch_size = rois_num_t->numel();
auto* rois_num_data = rois_num_t->data<int>();
int start = 0;
for (int n = 0; n < rois_batch_size; ++n) {
for (int i = start; i < start + rois_num_data[n]; ++i) {
roi_batch_id_data[i] = n;
}
start += rois_num_data[n];
}
} else {
auto rois_lod = rois->lod().back();
rois_batch_size = rois_lod.size() - 1;
for (int n = 0; n < rois_batch_size; ++n) {
for (std::size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
roi_batch_id_data[i] = n;
}
}
}
in_grad->mutable_data<T>(ctx.GetPlace());
auto& dev_ctx = ctx.template device_context<DeviceContext>();
phi::funcs::SetConstant<DeviceContext, T> set_zero;
set_zero(dev_ctx, in_grad, static_cast<T>(0));
int output_grad_size = out_grad->numel();
if ((!out_grad->IsInitialized()) || (output_grad_size <= 0)) {
return;
}
const T* rois_data = rois->data<T>();
const T* out_grad_data = out_grad->data<T>();
T* in_grad_data = in_grad->mutable_data<T>(ctx.GetPlace());
auto in_stride = phi::stride(in->dims());
auto roi_stride = phi::stride(rois->dims());
auto out_stride = phi::stride(out_grad->dims());
T roi_offset = aligned ? T(0.5) : 0;
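    // Revisit the same sampling grid as the forward pass: every pooled bin
    // averages roi_bin_grid_h * roi_bin_grid_w samples, so each sample
    // receives out_grad / count.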
for (int n = 0; n < rois_num; ++n) {
int roi_batch_idx = roi_batch_id_data[n];
T roi_xmin = rois_data[0] * spatial_scale - roi_offset;
T roi_ymin = rois_data[1] * spatial_scale - roi_offset;
T roi_xmax = rois_data[2] * spatial_scale - roi_offset;
T roi_ymax = rois_data[3] * spatial_scale - roi_offset;
T roi_width = roi_xmax - roi_xmin;
T roi_height = roi_ymax - roi_ymin;
roi_width = std::max(roi_width, static_cast<T>(1.));
roi_height = std::max(roi_height, static_cast<T>(1.));
if (!aligned) {
roi_width = std::max(roi_width, static_cast<T>(1.));
roi_height = std::max(roi_height, static_cast<T>(1.));
}
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
for (int c = 0; c < channels; ++c) {
T* batch_grad_data =
in_grad_data + roi_batch_idx * in_stride[0] + c * in_stride[1];
const T* batch_out_grad_data =
out_grad_data + n * out_stride[0] + c * out_stride[1];
for (int ph = 0; ph < pooled_height; ++ph) {
for (int pw = 0; pw < pooled_width; ++pw) {
int pool_index = ph * pooled_width + pw;
T out_grad_this_bin = batch_out_grad_data[pool_index];
int roi_bin_grid_h = (sampling_ratio > 0)
? sampling_ratio
: ceil(roi_height / pooled_height);
int roi_bin_grid_w = (sampling_ratio > 0)
? sampling_ratio
: ceil(roi_width / pooled_width);
T count = roi_bin_grid_h * roi_bin_grid_w;
for (int iy = 0; iy < roi_bin_grid_h; iy++) {
const T y = roi_ymin + ph * bin_size_h +
static_cast<T>(iy + .5f) * bin_size_h /
static_cast<T>(roi_bin_grid_h);
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
const T x = roi_xmin + pw * bin_size_w +
static_cast<T>(ix + .5f) * bin_size_w /
static_cast<T>(roi_bin_grid_w);
bilinear_interpolate_gradient(height, width, y, x,
out_grad_this_bin, count,
batch_grad_data);
}
}
}
}
}
rois_data += roi_stride[0];
}
}
};
} // namespace operators
} // namespace paddle
......@@ -12,13 +12,16 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/roll_op.h"
#include <memory>
#include <vector>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/platform/complex.h"
#include "paddle/fluid/operators/utils.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/unary.h"
namespace paddle {
namespace operators {
......@@ -29,43 +32,6 @@ class RollOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
platform::errors::InvalidArgument(
"Input(X) of RollOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
platform::errors::InvalidArgument(
"Output(Out) of RollOp should not be null."));
auto dims = ctx->Attrs().Get<std::vector<int64_t>>("axis");
auto shifts = ctx->Attrs().Get<std::vector<int64_t>>("shifts");
if (!ctx->HasInput("ShiftsTensor")) {
if (dims.size() != 0) {
PADDLE_ENFORCE_EQ(dims.size(), shifts.size(),
platform::errors::InvalidArgument(
"When dims.size() != 0, dims.size() "
"should be equal to "
"shifts.size(). But received "
"dims.size() = %d, shifts.size() = %d",
dims.size(), shifts.size()));
} else {
PADDLE_ENFORCE_EQ(shifts.size(), 1,
platform::errors::InvalidArgument(
"When dims.size() == 0, shifts.size() "
"should be equal to 1, But received "
"shifts.size() = %d",
shifts.size()));
}
}
ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
auto type = ctx->GetInputsVarType("X")[0];
if (type == framework::proto::VarType::LOD_TENSOR) {
ctx->ShareLoD("X", /*->*/ "Out");
}
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
......@@ -149,29 +115,15 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(RollGradNoNeedBufferVarsInferer, "X");
} // namespace paddle
namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(roll, RollInferShapeFunctor,
PD_INFER_META(phi::RollInferMeta));
REGISTER_OPERATOR(roll, ops::RollOp, ops::RollOpMaker,
ops::RollGradMaker<paddle::framework::OpDesc>,
ops::RollGradMaker<paddle::imperative::OpBase>);
ops::RollGradMaker<paddle::imperative::OpBase>,
RollInferShapeFunctor);
REGISTER_OPERATOR(roll_grad, ops::RollGradOp,
ops::RollGradNoNeedBufferVarsInferer);
REGISTER_OP_CPU_KERNEL(
roll, ops::RollKernel<paddle::platform::CPUDeviceContext, float>,
ops::RollKernel<paddle::platform::CPUDeviceContext, double>,
ops::RollKernel<paddle::platform::CPUDeviceContext, int>,
ops::RollKernel<paddle::platform::CPUDeviceContext, int64_t>,
ops::RollKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<float>>,
ops::RollKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<double>>);
REGISTER_OP_CPU_KERNEL(
roll_grad, ops::RollGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::RollGradKernel<paddle::platform::CPUDeviceContext, double>,
ops::RollGradKernel<paddle::platform::CPUDeviceContext, int>,
ops::RollGradKernel<paddle::platform::CPUDeviceContext, int64_t>,
ops::RollGradKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<float>>,
ops::RollGradKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<double>>);
REGISTER_OP_VERSION(roll)
.AddCheckpoint(
......
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/roll_op.h"
#include "paddle/fluid/platform/complex.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/core/utils/array.h"
namespace paddle {
namespace operators {
using platform::PADDLE_CUDA_NUM_THREADS;
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
template <typename T, size_t Rank>
__global__ void RollCudaKernel(const T* input, T* output, int64_t N,
phi::Array<int64_t, Rank> shifts,
phi::Array<int64_t, Rank> strides,
phi::Array<int64_t, Rank> sizes) {
int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= N) {
return;
}
int64_t output_idx = idx;
int64_t new_dim_idx = 0;
#pragma unroll
for (size_t i = 0; i < Rank; i++) {
new_dim_idx = (idx / strides[i]) % sizes[i] + shifts[i];
if (new_dim_idx >= sizes[i]) {
output_idx += (shifts[i] - sizes[i]) * strides[i];
} else {
output_idx += shifts[i] * strides[i];
}
}
output[output_idx] = input[idx];
}
template <typename T>
class RollKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* in = context.Input<LoDTensor>("X");
auto* out = context.Output<LoDTensor>("Out");
std::vector<int64_t> shifts = context.Attr<std::vector<int64_t>>("shifts");
if (context.HasInput("ShiftsTensor")) {
const auto* shifts_tensor =
context.Input<framework::Tensor>("ShiftsTensor");
PADDLE_ENFORCE_EQ(
shifts_tensor->dims().size(), 1,
platform::errors::InvalidArgument(
"The rank of ShiftsTensor is expected to be 1, got %s",
shifts_tensor->dims().size()));
shifts = GetDataFromTensor<int64_t>(shifts_tensor);
}
std::vector<int64_t> dims = context.Attr<std::vector<int64_t>>("axis");
auto* in_data = in->data<T>();
auto* out_data = out->mutable_data<T>(context.GetPlace());
int64_t numel = in->numel();
auto stream =
context.template device_context<platform::CUDADeviceContext>().stream();
size_t nums = shifts.size();
auto input_dim = in->dims();
auto stride_dim = phi::stride(input_dim);
std::vector<int64_t> strides(nums), sizes(nums);
if (dims.size() == 0) {
strides[0] = 1;
sizes[0] = numel;
shifts[0] = (shifts[0] % numel + numel) % numel;
} else {
for (size_t i = 0; i < nums; i++) {
int dim = dims[i] >= 0 ? dims[i] : dims[i] + input_dim.size();
int64_t size = input_dim[dim];
if (size != 0) {
shifts[i] = (shifts[i] % size + size) % size;
strides[i] = stride_dim[dim];
sizes[i] = size;
}
}
}
#define CALL_ROLL_CUDA_KERNEL(N) \
case N: { \
phi::Array<int64_t, N> _strides; \
phi::Array<int64_t, N> _shifts; \
phi::Array<int64_t, N> _sizes; \
for (size_t idx = 0; idx < N; ++idx) { \
_strides[idx] = strides[idx]; \
_shifts[idx] = shifts[idx]; \
_sizes[idx] = sizes[idx]; \
} \
RollCudaKernel< \
T, \
N><<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, \
PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_data, out_data, numel, \
_shifts, _strides, _sizes); \
break; \
}
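    // The kernel is templated on the number of rolled dims, so map the
    // runtime value `nums` onto a compile-time Rank (1..9) in this switch.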
switch (nums) {
CALL_ROLL_CUDA_KERNEL(1);
CALL_ROLL_CUDA_KERNEL(2);
CALL_ROLL_CUDA_KERNEL(3);
CALL_ROLL_CUDA_KERNEL(4);
CALL_ROLL_CUDA_KERNEL(5);
CALL_ROLL_CUDA_KERNEL(6);
CALL_ROLL_CUDA_KERNEL(7);
CALL_ROLL_CUDA_KERNEL(8);
CALL_ROLL_CUDA_KERNEL(9);
default:
PADDLE_THROW(platform::errors::InvalidArgument(
"shifts.size() should be less than 10, But received shifts.size() "
"= %d",
shifts.size()));
}
}
};
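// The gradient of roll is another roll with negated shifts, so the same
// kernel is reused after normalizing -shifts[i] into [0, size).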
template <typename T>
class RollGradKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* in = context.Input<LoDTensor>(framework::GradVarName("Out"));
auto* out = context.Output<LoDTensor>(framework::GradVarName("X"));
std::vector<int64_t> shifts = context.Attr<std::vector<int64_t>>("shifts");
if (context.HasInput("ShiftsTensor")) {
const auto* shifts_tensor =
context.Input<framework::Tensor>("ShiftsTensor");
PADDLE_ENFORCE_EQ(
shifts_tensor->dims().size(), 1,
platform::errors::InvalidArgument(
"The rank of ShiftsTensor is expected to be 1, got %s",
shifts_tensor->dims().size()));
shifts = GetDataFromTensor<int64_t>(shifts_tensor);
}
std::vector<int64_t> dims = context.Attr<std::vector<int64_t>>("axis");
auto* in_data = in->data<T>();
auto* out_data = out->mutable_data<T>(context.GetPlace());
int64_t numel = in->numel();
auto stream =
context.template device_context<platform::CUDADeviceContext>().stream();
size_t nums = shifts.size();
auto input_dim = in->dims();
auto stride_dim = phi::stride(input_dim);
std::vector<int64_t> strides(nums), sizes(nums);
if (dims.size() == 0) {
strides[0] = 1;
sizes[0] = numel;
shifts[0] = ((-shifts[0]) % numel + numel) % numel;
} else {
for (size_t i = 0; i < nums; i++) {
int dim = dims[i] >= 0 ? dims[i] : dims[i] + input_dim.size();
int64_t size = input_dim[dim];
if (size != 0) {
shifts[i] = ((-shifts[i]) % size + size) % size;
strides[i] = stride_dim[dim];
sizes[i] = size;
}
}
}
switch (nums) {
CALL_ROLL_CUDA_KERNEL(1);
CALL_ROLL_CUDA_KERNEL(2);
CALL_ROLL_CUDA_KERNEL(3);
CALL_ROLL_CUDA_KERNEL(4);
CALL_ROLL_CUDA_KERNEL(5);
CALL_ROLL_CUDA_KERNEL(6);
CALL_ROLL_CUDA_KERNEL(7);
CALL_ROLL_CUDA_KERNEL(8);
CALL_ROLL_CUDA_KERNEL(9);
default:
PADDLE_THROW(platform::errors::InvalidArgument(
"shifts.size() should be less than 10, But received shifts.size() "
"= %d",
shifts.size()));
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
roll, ops::RollKernel<paddle::platform::CUDADeviceContext, float>,
ops::RollKernel<paddle::platform::CUDADeviceContext, double>,
ops::RollKernel<paddle::platform::CUDADeviceContext, int>,
ops::RollKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::RollKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<float>>,
ops::RollKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<double>>);
REGISTER_OP_CUDA_KERNEL(
roll_grad, ops::RollGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::RollGradKernel<paddle::platform::CUDADeviceContext, double>,
ops::RollGradKernel<paddle::platform::CUDADeviceContext, int>,
ops::RollGradKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::RollGradKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<float>>,
ops::RollGradKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<double>>);
......@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/tril_triu_op.h"
#include <memory>
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
......@@ -104,19 +104,3 @@ REGISTER_OPERATOR(tril_triu, ops::TrilTriuOp, ops::TrilTriuOpMaker,
ops::TrilTriuGradOpMaker<paddle::framework::OpDesc>,
ops::TrilTriuGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(tril_triu_grad, ops::TrilTriuGradOp);
REGISTER_OP_CPU_KERNEL(
tril_triu, ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, bool>,
ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, double>,
ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, int>,
ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, int64_t>,
ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, plat::float16>);
REGISTER_OP_CPU_KERNEL(
tril_triu_grad,
ops::TrilTriuGradOpKernel<paddle::platform::CPUDeviceContext, bool>,
ops::TrilTriuGradOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::TrilTriuGradOpKernel<paddle::platform::CPUDeviceContext, double>,
ops::TrilTriuGradOpKernel<paddle::platform::CPUDeviceContext, int>,
ops::TrilTriuGradOpKernel<paddle::platform::CPUDeviceContext, int64_t>,
ops::TrilTriuGradOpKernel<paddle::platform::CPUDeviceContext,
plat::float16>);
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/for_range.h"
namespace paddle {
namespace operators {
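// Keeps the lower (or upper) triangular part of each H x W matrix: element
// (row, col) is zeroed when col - row > diagonal (lower = true) or
// col - row < diagonal (lower = false). E.g. diagonal = 0 with lower = true
// keeps the main diagonal and everything below it.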
template <typename T>
class TrilTriuCompute {
public:
HOSTDEVICE TrilTriuCompute(const T* in, const int diagonal, const bool lower,
const int64_t H, const int64_t W, T* out)
: in_(in), diagonal_(diagonal), lower_(lower), H_(H), W_(W), out_(out) {}
HOSTDEVICE void operator()(int64_t idx) {
const int64_t row = (idx / W_) % H_;
const int64_t col = idx % W_;
const bool mask =
lower_ ? (col - row > diagonal_) : (col - row < diagonal_);
out_[idx] = mask ? static_cast<T>(0) : in_[idx];
}
private:
const T* in_;
const int diagonal_;
const bool lower_;
const int64_t H_;
const int64_t W_;
T* out_;
};
template <typename DeviceContext, typename T>
class TrilTriuOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const auto* x = context.Input<framework::Tensor>("X");
const auto* x_data = x->data<T>();
auto* out = context.Output<framework::Tensor>("Out");
auto* out_data = out->mutable_data<T>(context.GetPlace());
const int diagonal = context.Attr<int>("diagonal");
const bool lower = context.Attr<bool>("lower");
const auto& dims = x->dims();
const auto H = dims[dims.size() - 2];
const auto W = dims[dims.size() - 1];
platform::ForRange<DeviceContext> for_range(
context.template device_context<DeviceContext>(),
static_cast<size_t>(x->numel()));
paddle::operators::TrilTriuCompute<T> tril_triu_computer(
x_data, diagonal, lower, H, W, out_data);
for_range(tril_triu_computer);
}
};
template <typename DeviceContext, typename T>
class TrilTriuGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const auto* d_out =
context.Input<framework::Tensor>(framework::GradVarName("Out"));
const auto* dout_data = d_out->data<T>();
auto* d_x = context.Output<framework::Tensor>(framework::GradVarName("X"));
auto* dx_data = d_x->mutable_data<T>(context.GetPlace());
const int diagonal = context.Attr<int>("diagonal");
const bool lower = context.Attr<bool>("lower");
const auto& dims = d_out->dims();
const auto H = dims[dims.size() - 2];
const auto W = dims[dims.size() - 1];
platform::ForRange<DeviceContext> for_range(
context.template device_context<DeviceContext>(),
static_cast<size_t>(d_out->numel()));
paddle::operators::TrilTriuCompute<T> tril_triu_grad_computer(
dout_data, diagonal, lower, H, W, dx_data);
for_range(tril_triu_grad_computer);
}
};
} // namespace operators
} // namespace paddle
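As an aside, a minimal standalone sketch of the masking rule that TrilTriuCompute applies above; the reference function, the 3x3 values, and the main() driver are hypothetical and not part of this commit.
#include <cstdint>
#include <cstdio>
// Mirrors the mask rule in TrilTriuCompute: for tril (lower == true) zero out
// elements strictly above the chosen diagonal, for triu zero out elements
// strictly below it.
void tril_triu_reference(const float* in, int diagonal, bool lower,
                         int64_t H, int64_t W, float* out) {
  for (int64_t idx = 0; idx < H * W; ++idx) {
    const int64_t row = (idx / W) % H;
    const int64_t col = idx % W;
    const bool mask = lower ? (col - row > diagonal) : (col - row < diagonal);
    out[idx] = mask ? 0.0f : in[idx];
  }
}
int main() {
  const float in[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
  float out[9];
  tril_triu_reference(in, /*diagonal=*/0, /*lower=*/true, 3, 3, out);
  // Prints the lower triangle: 1 0 0 / 4 5 0 / 7 8 9
  for (int i = 0; i < 9; ++i) {
    std::printf("%g%c", out[i], (i % 3 == 2) ? '\n' : ' ');
  }
  return 0;
}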
......@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/tril_triu_op.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
......
......@@ -11,7 +11,7 @@ limitations under the License. */
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/operators/tril_triu_op.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
namespace paddle {
......
......@@ -143,6 +143,7 @@ void BindNode(py::module *m) {
.def("var", &Node::Var, return_value_policy::reference)
.def("op", &Node::Op, return_value_policy::reference)
.def("id", &Node::id)
.def("graph_id", &Node::GraphId)
.def("original_desc_id", &Node::OriginalDescId)
.def("is_op", &Node::IsOp)
.def("is_var", &Node::IsVar)
......
......@@ -114,6 +114,7 @@ limitations under the License. */
#include "paddle/fluid/pybind/metrics_py.h"
#include "paddle/fluid/pybind/ps_gpu_wrapper_py.h"
#include "paddle/fluid/pybind/pybind_boost_headers.h"
#include "paddle/phi/backends/device_manager.h"
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/pybind/nccl_wrapper_py.h"
......@@ -742,6 +743,11 @@ PYBIND11_MODULE(core_noavx, m) {
// stored in this static instance to avoid illegal memory access.
m.def("clear_kernel_factory",
[]() { phi::KernelFactory::Instance().kernels().clear(); });
m.def("clear_device_manager", []() {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
phi::DeviceManager::Clear();
#endif
});
// NOTE(zjl): ctest would load environment variables at the beginning even
// though we have not `import paddle.fluid as fluid`. So we add this API
......
......@@ -134,6 +134,10 @@ mlir::Type InfrtDialect::parseType(::mlir::DialectAsmParser &parser) const {
return DenseTensorType::get(
parser.getContext(), *targetType, *precisionType, *layoutType);
}
if (keyword == "dense_tensor_map") {
return DenseTensorMapType::get(parser.getContext());
}
// Todo: parse other type
return mlir::Type();
}
......@@ -156,7 +160,7 @@ void InfrtDialect::printType(::mlir::Type type,
}
// print DenseTensorType, for example: !infrt.dense_tensor<CPU, FP32, NCHW>
if (type.isa<infrt::DenseTensorType>()) {
if (type.isa<DenseTensorType>()) {
auto dense_tensor_type = type.cast<infrt::DenseTensorType>();
os << "dense_tensor<" << dense_tensor_type.getTarget() << ", "
<< dense_tensor_type.getPrecision() << ", "
......@@ -164,6 +168,12 @@ void InfrtDialect::printType(::mlir::Type type,
return;
}
// print DenseTensorMapType, for example: !infrt.dense_tensor_map
if (type.isa<DenseTensorMapType>()) {
os << "dense_tensor_map";
return;
}
llvm_unreachable("unknown infrt type.");
}
......
......@@ -18,7 +18,7 @@
#include "paddle/infrt/common/global.h"
#include "paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.h"
#include "paddle/infrt/dialect/mlir_loader.h"
#include "paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h"
#include "paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h"
int main(int argc, char** argv) {
static llvm::cl::opt<std::string> input_file(
......
......@@ -393,6 +393,11 @@ DeviceManager& DeviceManager::Instance() {
return platform_manager;
}
void DeviceManager::Clear() {
Instance().device_map_.clear();
Instance().device_impl_map_.clear();
}
std::vector<std::string> ListAllLibraries(const std::string& library_dir) {
std::vector<std::string> libraries;
std::regex express(".*\\.so");
......
......@@ -158,6 +158,8 @@ class DeviceManager {
static std::vector<size_t> GetDeviceList(const std::string& device_type);
static void Clear();
private:
DISABLE_COPY_AND_ASSIGN(DeviceManager);
DeviceManager() {}
......
......@@ -24,6 +24,10 @@ limitations under the License. */
namespace phi {
// Common InferMeta Functions for backward operators.
//
// NOTE: The InferMeta Functions in this file are arranged in alphabetic order.
void BilinearTensorProductGradInferMeta(const MetaTensor& x,
const MetaTensor& y,
const MetaTensor& weight,
......
......@@ -73,6 +73,51 @@ void AllValueCompareInferMeta(const MetaTensor& x,
out->set_dtype(DataType::BOOL);
}
void KLDivInferMeta(const MetaTensor& x,
const MetaTensor& label,
const std::string& reduction,
MetaTensor* out,
MetaConfig config) {
auto dim_x = x.dims();
auto dim_target = label.dims();
PADDLE_ENFORCE_EQ(dim_x.size(),
dim_target.size(),
phi::errors::InvalidArgument(
"Input(X) rank and Input(Target) rank should be "
"same, but received X rank(%d) != Target rank(%d)",
dim_x.size(),
dim_target.size()));
for (int i = 0; i < dim_x.size(); i++) {
if (config.is_runtime || (dim_x[i] > 0 && dim_target[i] > 0)) {
PADDLE_ENFORCE_EQ(
dim_x[i],
dim_target[i],
phi::errors::InvalidArgument(
"Input(X) and Input(Target) should in same shape. but received "
"X dimension[%d](%d) != Target dimension[%d](%d)",
i,
dim_x[i],
i,
dim_target[i]));
}
}
auto reduction_valid = "mean" == reduction || "sum" == reduction ||
"batchmean" == reduction || "none" == reduction;
PADDLE_ENFORCE_EQ(
reduction_valid,
true,
phi::errors::InvalidArgument(
"Attr(reduction) can only be 'none'|'batchmean'|'sum'|'mean'."));
if ("none" == reduction) {
out->set_dims(dim_x);
} else {
out->set_dims({1});
}
out->set_dtype(x.dtype());
}
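As an aside, a sketch of the output-shape rule KLDivInferMeta enforces above, restated against plain std::vector so it can be read in isolation; the helper name and the example shapes are assumptions, not part of this commit.
#include <cstdint>
#include <string>
#include <vector>
// "none" keeps the elementwise loss shape; "mean", "sum" and "batchmean"
// reduce the loss to a single value, reported as shape {1}.
std::vector<int64_t> KLDivOutputShape(const std::vector<int64_t>& x_dims,
                                      const std::string& reduction) {
  return reduction == "none" ? x_dims : std::vector<int64_t>{1};
}
// e.g. KLDivOutputShape({32, 10}, "none")      -> {32, 10}
//      KLDivOutputShape({32, 10}, "batchmean") -> {1}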
void Atan2InferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) {
out->share_meta(x);
}
......@@ -431,6 +476,55 @@ void ElementwiseRawInferMeta(const MetaTensor& x,
out->share_lod(x);
}
void GatherInferMeta(const MetaTensor& x,
const MetaTensor& index,
const Scalar& axis,
MetaTensor* out) {
auto index_dims = index.dims();
if (index_dims.size() == 2) {
PADDLE_ENFORCE_EQ(
index_dims[1],
1,
phi::errors::InvalidArgument(
"The last dim of index should be 1 when it is 2D, but we get %d",
index_dims[1]));
} else {
PADDLE_ENFORCE_EQ(
index_dims.size(),
1,
phi::errors::InvalidArgument(
"The index should be 1D, when it is not 2D, but we get %d",
index_dims.size()));
}
auto input_dim = x.dims();
auto axis_v = axis.to<int>();
if (axis.FromTensor() || axis_v == 0) {
// if axis.FromTensor(), we can not obtain correct shape of output
int batch_size = index_dims[0];
phi::DDim output_dims(input_dim);
output_dims[0] = batch_size;
out->set_dims(output_dims);
out->set_dtype(x.dtype());
out->share_lod(x);
} else {
int index_size = index_dims[0];
std::vector<int> out_dim_vec;
for (int i = 0; i < axis_v; i++) {
out_dim_vec.push_back(input_dim[i]);
}
out_dim_vec.push_back(index_size);
for (int i = axis_v + 1; i < input_dim.size(); i++) {
out_dim_vec.push_back(input_dim[i]);
}
auto output_dims = phi::make_ddim(out_dim_vec);
out->set_dims(output_dims);
out->set_dtype(x.dtype());
out->share_lod(x);
}
}
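As an aside, a sketch of the static-shape branch of GatherInferMeta above, for a 1-D index and a non-zero axis that is known at this point; the helper name and example shapes are assumptions, not part of this commit.
#include <cstdint>
#include <vector>
std::vector<int64_t> GatherOutputShape(const std::vector<int64_t>& x_dims,
                                       int64_t index_size, int axis) {
  std::vector<int64_t> out;
  for (int i = 0; i < axis; ++i) out.push_back(x_dims[i]);
  out.push_back(index_size);  // the gathered axis takes the index length
  for (int i = axis + 1; i < static_cast<int>(x_dims.size()); ++i) {
    out.push_back(x_dims[i]);
  }
  return out;
}
// e.g. x_dims = {5, 8, 10}, index of length 3, axis = 1  ->  {5, 3, 10}
// When the axis comes from a tensor, the shape cannot be resolved here and the
// InferMeta above falls back to replacing only the leading dimension.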
void GatherNdInferMeta(const MetaTensor& x,
const MetaTensor& index,
MetaTensor* out) {
......@@ -549,6 +643,49 @@ void IndexSampleInferMeta(const MetaTensor& x,
out->share_lod(y);
}
void IndexSelectInferMeta(const MetaTensor& x,
const MetaTensor& index,
int dim,
MetaTensor* output) {
auto input_dim = x.dims();
auto index_dim = index.dims();
PADDLE_ENFORCE_EQ(
dim < input_dim.size() && dim >= (0 - input_dim.size()),
true,
phi::errors::OutOfRange(
"Attr(dim) is out of range, It's expected "
"to be in range of [-%d, %d]. But received Attr(dim) = %d.",
input_dim.size(),
input_dim.size() - 1,
dim));
PADDLE_ENFORCE_EQ(
index_dim.size() == 1 || (index_dim.size() == 2 && index_dim[1] == 1),
true,
phi::errors::InvalidArgument(
"The 'shape' of Input(Index) must be 1-D tensor. "
"But received: the 'shape' of Input(Index) is [%s], "
"the dimension of Input(Index) is [%d].",
index_dim,
index_dim.size()));
PADDLE_ENFORCE_EQ(
index_dim[0] != 0,
true,
phi::errors::InvalidArgument("The length of Input(Index) can't be 0."));
auto output_dim = phi::vectorize(input_dim);
if (dim < 0) {
dim += input_dim.size();
}
output_dim[dim] = index_dim[0];
output->set_dims(phi::make_ddim(output_dim));
output->set_dtype(x.dtype());
output->set_layout(x.layout());
output->share_lod(x);
}
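As an aside, a sketch of the shape transform performed by IndexSelectInferMeta above; the helper name and example shapes are assumptions, not part of this commit.
#include <cstdint>
#include <vector>
std::vector<int64_t> IndexSelectOutputShape(std::vector<int64_t> x_dims,
                                            int64_t index_len, int dim) {
  if (dim < 0) dim += static_cast<int>(x_dims.size());
  x_dims[dim] = index_len;  // only the selected dimension changes
  return x_dims;
}
// e.g. x_dims = {4, 6, 8}, index of length 3, dim = -1  ->  {4, 6, 3}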
void LogLossInferMeta(const MetaTensor& input,
const MetaTensor& label,
float epsilon,
......@@ -813,6 +950,16 @@ void TriangularSolveInferMeta(const MetaTensor& x,
out->share_lod(y);
}
void ValueCompareInferMeta(const MetaTensor& x,
const MetaTensor& y,
MetaTensor* out,
MetaConfig config) {
detail::BinarySameInputDimsCheck(x, y, config);
out->set_dims(x.dims());
out->set_dtype(DataType::BOOL);
}
} // namespace phi
PD_REGISTER_INFER_META_FN(add_raw, phi::ElementwiseRawInferMeta);
......@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once
#include "paddle/phi/common/scalar.h"
#include "paddle/phi/core/meta_tensor.h"
namespace phi {
......@@ -28,12 +29,20 @@ namespace phi {
// NOTE: The name "InferShape" may be not appropriate. "InferMeta" may be good.
// Because functions in this file not only can infer shape, but also need
// infer lod or other useful data.
//
// The InferMeta Functions in this file are arranged in alphabetic order.
void AllValueCompareInferMeta(const MetaTensor& x,
const MetaTensor& y,
MetaTensor* out,
MetaConfig config = MetaConfig());
void KLDivInferMeta(const MetaTensor& x,
const MetaTensor& label,
const std::string& reduction,
MetaTensor* out,
MetaConfig config = MetaConfig());
void Atan2InferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out);
void BCELossInferMeta(const MetaTensor& input,
......@@ -81,6 +90,11 @@ void ElementwiseRawInferMeta(const MetaTensor& x_meta,
int axis,
MetaTensor* out);
void GatherInferMeta(const MetaTensor& x,
const MetaTensor& index,
const Scalar& axis,
MetaTensor* out);
void GatherNdInferMeta(const MetaTensor& x,
const MetaTensor& index,
MetaTensor* out);
......@@ -101,6 +115,11 @@ void IndexSampleInferMeta(const MetaTensor& x,
MetaTensor* out,
MetaConfig config = MetaConfig());
void IndexSelectInferMeta(const MetaTensor& x,
const MetaTensor& index,
int dim,
MetaTensor* output);
void LogLossInferMeta(const MetaTensor& input,
const MetaTensor& label,
float epsilon,
......@@ -136,4 +155,9 @@ void TriangularSolveInferMeta(const MetaTensor& x,
bool unitriangular,
MetaTensor* out);
void ValueCompareInferMeta(const MetaTensor& x,
const MetaTensor& y,
MetaTensor* out,
MetaConfig config = MetaConfig());
} // namespace phi
......@@ -18,6 +18,23 @@ limitations under the License. */
#include "paddle/phi/core/meta_tensor.h"
namespace phi {
// Common InferMeta Functions for multiary operators, The format like:
//
// 1. The number of input MetaTensor is more than 3:
// void [FunctionDesc|OpName]InferMeta(const MetaTensor& x,
// const MetaTensor& y,
// const MetaTensor& z,
// const MetaTensor& w,
// ...,
// MetaTensor* out) {}
//
// 2. There are `const vector<MetaTensor*>&` in params:
// void [FunctionDesc|OpName]InferMeta(const vector<MetaTensor*>& x,
// ...,
// MetaTensor* out) {}
//
// NOTE: The InferMeta Functions in this file are arranged in alphabetic order.
std::vector<DDim> GetMetaTensorsDim(const std::vector<MetaTensor*>& tensors);
void AdadeltaInferMeta(const MetaTensor& param,
......
......@@ -27,6 +27,8 @@ namespace phi {
// NOTE: The name "InferShape" may be not appropriate. "InferMeta" may be good.
// Because functions in this file not only can infer shape, but also need
// infer lod or other useful data.
//
// The InferMeta Functions in this file are arranged in alphabetic order.
void CreateInferMeta(const ScalarArray& shape, DataType dtype, MetaTensor* out);
......
......@@ -322,6 +322,83 @@ void NllLossRawInferMeta(const MetaTensor& input,
total_weight->set_dtype(input.dtype());
}
void RoiAlignInferMeta(const MetaTensor& x,
const MetaTensor& boxes,
paddle::optional<const MetaTensor&> boxes_num,
int pooled_height,
int pooled_width,
float spatial_scale,
int sampling_ratio,
bool aligned,
MetaTensor* out,
MetaConfig config) {
auto input_dims = x.dims();
auto boxes_dims = boxes.dims();
if (boxes_num) {
auto boxes_num_dims = boxes_num->dims();
PADDLE_ENFORCE_EQ(
boxes_num_dims.size(),
1,
phi::errors::InvalidArgument("The size of RoisNum should be 1"
", but received size = %d",
boxes_num_dims.size()));
}
PADDLE_ENFORCE_EQ(input_dims.size(),
4,
phi::errors::InvalidArgument(
"The format of Input(X) in"
"RoIAlignOp is NCHW. And the rank of input must be 4. "
"But received rank = %d",
input_dims.size()));
PADDLE_ENFORCE_EQ(boxes_dims.size(),
2,
phi::errors::InvalidArgument("The rank of Input(ROIs) "
"in RoIAlignOp should be 2. "
"But the rank of RoIs is %d",
boxes_dims.size()));
if (config.is_runtime) {
PADDLE_ENFORCE_EQ(boxes_dims[1],
4,
phi::errors::InvalidArgument(
"The second dimension "
"of Input(ROIs) should be 4. But received the "
"dimension = %d",
boxes_dims[1]));
}
PADDLE_ENFORCE_GT(pooled_height,
0,
phi::errors::InvalidArgument(
"The 'pooled_height' attribute in RoIAlignOp is "
"invalid. The height must be greater than 0. But "
"received 'pooled_height' = %d",
pooled_height));
PADDLE_ENFORCE_GT(pooled_width,
0,
phi::errors::InvalidArgument(
"The 'pooled_width' attribute in RoIAlignOp is "
"invalid. The width must be greater than 0. But "
"received 'pooled_width' = %d",
pooled_width));
PADDLE_ENFORCE_GT(spatial_scale,
0.0f,
phi::errors::InvalidArgument(
"The 'spatial_scale' attribute in RoIAlignOp is "
"invalid. The scale must be greater than 0. But "
"received 'spatial_scale' = %f",
spatial_scale));
auto out_dims = input_dims;
out_dims[0] = boxes_dims[0];
out_dims[1] = input_dims[1];
out_dims[2] = pooled_height;
out_dims[3] = pooled_width;
out->set_dims(out_dims);
out->set_dtype(x.dtype());
}
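As an aside, a sketch of the NCHW output shape that RoiAlignInferMeta above derives once its checks pass; the helper name and example shapes are assumptions, not part of this commit.
#include <array>
#include <cstdint>
std::array<int64_t, 4> RoiAlignOutputShape(
    const std::array<int64_t, 4>& x_dims, int64_t num_boxes,
    int pooled_height, int pooled_width) {
  // [num_rois, channels, pooled_height, pooled_width]
  return {num_boxes, x_dims[1], pooled_height, pooled_width};
}
// e.g. x = {2, 256, 64, 64}, 100 boxes, 7x7 pooling  ->  {100, 256, 7, 7}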
void ScatterInferMeta(const MetaTensor& x,
const MetaTensor& index,
const MetaTensor& updates,
......
......@@ -30,6 +30,8 @@ namespace phi {
// Because functions in this file not only can infer shape, but also need
// infer lod or other useful data.
//
// The InferMeta Functions in this file are arranged in alphabetic order.
void AccuracyInferMeta(const MetaTensor& out,
const MetaTensor& indice,
const MetaTensor& label,
......@@ -71,6 +73,17 @@ void NllLossRawInferMeta(const MetaTensor& input,
MetaTensor* total_weight,
MetaConfig config = MetaConfig());
void RoiAlignInferMeta(const MetaTensor& x,
const MetaTensor& boxes,
paddle::optional<const MetaTensor&> boxes_num,
int pooled_height,
int pooled_width,
float spatial_scale,
int sampling_ratio,
bool aligned,
MetaTensor* out,
MetaConfig config = MetaConfig());
void ScatterInferMeta(const MetaTensor& x,
const MetaTensor& index,
const MetaTensor& updates,
......
......@@ -1016,6 +1016,37 @@ void ReshapeWithXShapeInferMeta(const MetaTensor& x,
ReshapeInferMeta(x, shape, out, config);
}
void RollInferMeta(const MetaTensor& x,
const ScalarArray& shifts,
const std::vector<int64_t>& axis,
MetaTensor* out) {
auto shifts_data = shifts.GetData();
if (axis.size() != 0) {
PADDLE_ENFORCE_EQ(
axis.size(),
shifts_data.size(),
phi::errors::InvalidArgument("When dims.size() != 0, dims.size() "
"should be equal to "
"shifts.size(). But received "
"dims.size() = %d, shifts.size() = %d",
axis.size(),
shifts_data.size()));
} else {
PADDLE_ENFORCE_EQ(
shifts_data.size(),
1,
phi::errors::InvalidArgument("When dims.size() == 0, shifts.size() "
"should be equal to 1, But received "
"shifts.size() = %d",
shifts_data.size()));
}
out->set_dims(x.dims());
out->share_lod(x);
out->set_dtype(x.dtype());
}
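As an aside, a sketch of the consistency check RollInferMeta above performs between shifts and axis (the output itself keeps the input's dims, lod and dtype); the helper name and the exception messages are assumptions, not part of this commit.
#include <cstdint>
#include <stdexcept>
#include <vector>
void CheckRollShiftsAndAxis(const std::vector<int64_t>& shifts,
                            const std::vector<int64_t>& axis) {
  if (!axis.empty()) {
    if (axis.size() != shifts.size()) {
      throw std::invalid_argument(
          "axis.size() must equal shifts.size() when axis is given");
    }
  } else if (shifts.size() != 1) {
    throw std::invalid_argument("shifts.size() must be 1 when axis is empty");
  }
}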
void ShapeInferMeta(const MetaTensor& input, MetaTensor* out) {
auto in_dim = input.dims();
out->set_dims(phi::make_ddim({in_dim.size()}));
......
......@@ -31,6 +31,8 @@ class MetaConfig;
// NOTE: The name "InferShape" may be not appropriate. "InferMeta" may be good.
// Because functions in this file not only can infer shape, but also need
// infer lod or other useful data.
//
// The InferMeta Functions in this file are arranged in alphabetic order.
void ArgMinMaxInferMeta(const MetaTensor& x,
int64_t axis,
......@@ -164,6 +166,11 @@ void ReshapeWithXShapeInferMeta(const MetaTensor& x,
MetaTensor* out,
MetaConfig config = MetaConfig());
void RollInferMeta(const MetaTensor& x,
const ScalarArray& shifts,
const std::vector<int64_t>& axis,
MetaTensor* out);
void ShapeInferMeta(const MetaTensor& input, MetaTensor* out);
void ShardIndexInferMeta(const MetaTensor& in,
......
......@@ -26,6 +26,23 @@ namespace phi {
const DenseTensor& dout, \
DenseTensor* dx);
#define DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(name, attr) \
template <typename T, typename Context> \
void name##GradKernel(const Context& dev_ctx, \
const DenseTensor& x, \
const DenseTensor& dout, \
float attr, \
DenseTensor* dx);
#define DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX(name, attr1, attr2) \
template <typename T, typename Context> \
void name##GradKernel(const Context& dev_ctx, \
const DenseTensor& x, \
const DenseTensor& dout, \
float attr1, \
float attr2, \
DenseTensor* dx);
#define DECLARE_ACTIVATION_GRAD_KERNEL_DepOut(name) \
template <typename T, typename Context> \
void name##GradKernel(const Context& dev_ctx, \
......@@ -33,6 +50,14 @@ namespace phi {
const DenseTensor& dout, \
DenseTensor* dx);
#define DECLARE_ACTIVATION_GRAD_KERNEL_WITH_ONE_ATTRS_DepOut(name, attr) \
template <typename T, typename Context> \
void name##GradKernel(const Context& dev_ctx, \
const DenseTensor& out, \
const DenseTensor& dout, \
float attr, \
DenseTensor* dx);
template <typename T, typename Context>
void ReluDoubleGradKernel(const Context& dev_ctx,
const DenseTensor& out,
......@@ -59,34 +84,29 @@ void TanhTripleGradKernel(const Context& dev_ctx,
DenseTensor* d_ddx);
template <typename T, typename Context>
void BReluGradKernel(const Context& dev_ctx,
void LeakyReluDoubleGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& dout,
float t_min,
float t_max,
DenseTensor* dx);
const DenseTensor& ddx,
float alpha,
DenseTensor* ddout);
template <typename T, typename Context>
void LeakyReluGradKernel(const Context& dev_ctx,
void EluGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& out,
const DenseTensor& dout,
float alpha,
DenseTensor* dx);
template <typename T, typename Context>
void LeakyReluDoubleGradKernel(const Context& dev_ctx,
void EluDoubleGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& dout,
const DenseTensor& ddx,
float alpha,
DenseTensor* dx,
DenseTensor* ddout);
template <typename T, typename Context>
void ThresholdedReluGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& dout,
float threshold,
DenseTensor* dx);
DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Cos);
DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Tan);
DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Acos);
......@@ -98,7 +118,17 @@ DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Cosh);
DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Asinh);
DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Acosh);
DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Atanh);
DECLARE_ACTIVATION_GRAD_KERNEL_DepX(TanhShrink);
DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Silu);
DECLARE_ACTIVATION_GRAD_KERNEL_DepOut(Relu);
DECLARE_ACTIVATION_GRAD_KERNEL_DepOut(Tanh);
DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(LeakyRelu, alpha)
DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(ThresholdedRelu, threshold)
DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(SoftShrink, lambda)
DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(HardShrink, threshold)
DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX(BRelu, t_min, t_max)
} // namespace phi
......@@ -24,6 +24,21 @@ namespace phi {
void name##Kernel( \
const Context& dev_ctx, const DenseTensor& x, DenseTensor* out);
#define DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(name, attr) \
template <typename T, typename Context> \
void name##Kernel(const Context& dev_ctx, \
const DenseTensor& x, \
float attr, \
DenseTensor* out);
#define DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(name, attr1, attr2) \
template <typename T, typename Context> \
void name##Kernel(const Context& dev_ctx, \
const DenseTensor& x, \
float attr1, \
float attr2, \
DenseTensor* out);
DECLARE_ACTIVATION_KERNEL(Cos)
DECLARE_ACTIVATION_KERNEL(Tan)
DECLARE_ACTIVATION_KERNEL(Acos)
......@@ -37,24 +52,15 @@ DECLARE_ACTIVATION_KERNEL(Acosh)
DECLARE_ACTIVATION_KERNEL(Atanh)
DECLARE_ACTIVATION_KERNEL(Relu)
DECLARE_ACTIVATION_KERNEL(Tanh)
DECLARE_ACTIVATION_KERNEL(TanhShrink)
DECLARE_ACTIVATION_KERNEL(Silu)
template <typename T, typename Context>
void BReluKernel(const Context& dev_ctx,
const DenseTensor& x,
float t_min,
float t_max,
DenseTensor* out);
DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(LeakyRelu, alpha)
DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu, threshold)
DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(SoftShrink, lambda)
DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(HardShrink, threshold)
DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Elu, alpha)
template <typename T, typename Context>
void LeakyReluKernel(const Context& dev_ctx,
const DenseTensor& x,
float alpha,
DenseTensor* out);
template <typename T, typename Context>
void ThresholdedReluKernel(const Context& dev_ctx,
const DenseTensor& x,
float threshold,
DenseTensor* out);
DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(BRelu, t_min, t_max)
} // namespace phi
......@@ -21,18 +21,18 @@ limitations under the License. */
namespace phi {
#define DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(name, functor_class) \
#define DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(name, functor_class) \
template <typename T, typename Context> \
void name##GradKernel(const Context& dev_ctx, \
const DenseTensor& x, \
const DenseTensor& dout, \
DenseTensor* dx) { \
functor_class<T> functor; \
ActivationGradImpl<T, Context, functor_class<T>>( \
funcs::functor_class<T> functor; \
ActivationGradImpl<T, Context, funcs::functor_class<T>>( \
dev_ctx, &x, nullptr, &dout, dx, functor); \
}
#define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX( \
#define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX( \
name, functor_class, attr) \
template <typename T, typename Context> \
void name##GradKernel(const Context& dev_ctx, \
......@@ -40,14 +40,14 @@ namespace phi {
const DenseTensor& dout, \
float attr, \
DenseTensor* dx) { \
functor_class<T> functor; \
funcs::functor_class<T> functor; \
auto attrs = functor.GetAttrs(); \
*(attrs[0].second) = attr; \
ActivationGradImpl<T, Context, functor_class<T>>( \
ActivationGradImpl<T, Context, funcs::functor_class<T>>( \
dev_ctx, &x, nullptr, &dout, dx, functor); \
}
#define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX( \
#define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX( \
name, functor_class, attr1, attr2) \
template <typename T, typename Context> \
void name##GradKernel(const Context& dev_ctx, \
......@@ -56,26 +56,26 @@ namespace phi {
float attr1, \
float attr2, \
DenseTensor* dx) { \
functor_class<T> functor; \
funcs::functor_class<T> functor; \
auto attrs = functor.GetAttrs(); \
*(attrs[0].second) = attr1; \
*(attrs[1].second) = attr2; \
ActivationGradImpl<T, Context, functor_class<T>>( \
ActivationGradImpl<T, Context, funcs::functor_class<T>>( \
dev_ctx, &x, nullptr, &dout, dx, functor); \
}
#define DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepOut(name, functor_class) \
#define DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(name, functor_class) \
template <typename T, typename Context> \
void name##GradKernel(const Context& dev_ctx, \
const DenseTensor& out, \
const DenseTensor& dout, \
DenseTensor* dx) { \
functor_class<T> functor; \
ActivationGradImpl<T, Context, functor_class<T>>( \
funcs::functor_class<T> functor; \
ActivationGradImpl<T, Context, funcs::functor_class<T>>( \
dev_ctx, nullptr, &out, &dout, dx, functor); \
}
#define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepOut( \
#define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPOUT( \
name, functor_class, attr) \
template <typename T, typename Context> \
void name##GradKernel(const Context& dev_ctx, \
......@@ -83,39 +83,78 @@ namespace phi {
const DenseTensor& dout, \
float attr, \
DenseTensor* dx) { \
functor_class<T> functor; \
funcs::functor_class<T> functor; \
auto attrs = functor.GetAttrs(); \
*(attrs[0].second) = attr; \
ActivationGradImpl<T, Context, functor_class<T>>( \
ActivationGradImpl<T, Context, funcs::functor_class<T>>( \
dev_ctx, nullptr, &out, &dout, dx, functor); \
}
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Cos, funcs::CosGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Tan, funcs::TanGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Acos, funcs::AcosGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Sin, funcs::SinGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Asin, funcs::AsinGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Atan, funcs::AtanGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Sinh, funcs::SinhGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Cosh, funcs::CoshGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Asinh, funcs::AsinhGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Acosh, funcs::AcoshGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Atanh, funcs::AtanhGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepOut(Relu, funcs::ReluGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepOut(Tanh, funcs::TanhGradFunctor);
DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(LeakyRelu,
funcs::LeakyReluGradFunctor,
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Cos, CosGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Tan, TanGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Acos, AcosGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Sin, SinGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Asin, AsinGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Atan, AtanGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Sinh, SinhGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Cosh, CoshGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Asinh, AsinhGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Acosh, AcoshGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Atanh, AtanhGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(TanhShrink, TanhShrinkGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Silu, SiluGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu, ReluGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh, TanhGradFunctor);
DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu,
LeakyReluGradFunctor,
alpha);
DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(
ThresholdedRelu, funcs::ThresholdedReluGradFunctor, threshold);
DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX(BRelu,
funcs::BReluGradFunctor,
DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(ThresholdedRelu,
ThresholdedReluGradFunctor,
threshold);
DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(SoftShrink,
SoftShrinkGradFunctor,
lambda);
DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(HardShrink,
HardShrinkGradFunctor,
threshold);
DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(BRelu,
BReluGradFunctor,
t_min,
t_max);
template <typename T, typename Context>
void EluGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& out,
const DenseTensor& dout,
float alpha,
DenseTensor* dx) {
dev_ctx.template Alloc<T>(dx);
auto x_flatten =
EigenVector<T>::Flatten(GET_DATA_SAFELY(&x, "Input", "X", "elu_grad"));
auto out_flatten = EigenVector<T>::Flatten(
GET_DATA_SAFELY(&out, "Input", "Out", "elu_grad"));
auto dout_flatten = EigenVector<T>::Flatten(
GET_DATA_SAFELY(&dout, "Input", "dOut", "elu_grad"));
auto dx_flatten =
EigenVector<T>::Flatten(GET_DATA_SAFELY(dx, "Output", "dX", "elu_grad"));
auto* place = dev_ctx.eigen_device();
if (alpha > 0) {
funcs::ELUGradFunctor<T> functor;
functor.alpha = alpha;
functor(*place, x_flatten, out_flatten, dout_flatten, dx_flatten);
} else {
funcs::ELUGradNegativeAlphaFunctor<T> functor;
functor.alpha = alpha;
functor(*place, x_flatten, out_flatten, dout_flatten, dx_flatten);
}
}
} // namespace phi
PD_REGISTER_KERNEL(
......@@ -144,6 +183,11 @@ PD_REGISTER_ACTIVATION_GRAD_KERNEL(brelu_grad, BReluGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(leaky_relu_grad, LeakyReluGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(thresholded_relu_grad,
ThresholdedReluGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(soft_shrink_grad, SoftShrinkGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_shrink_grad, HardShrinkGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_shrink_grad, TanhShrinkGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(elu_grad, EluGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(silu_grad, SiluGradKernel)
PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(relu_double_grad,
ReluDoubleGradKernel)
......@@ -151,6 +195,7 @@ PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(tanh_double_grad,
TanhDoubleGradKernel)
PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(leaky_relu_double_grad,
LeakyReluDoubleGradKernel)
PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(elu_double_grad, EluDoubleGradKernel)
PD_REGISTER_KERNEL(tanh_triple_grad,
CPU,
......
......@@ -23,8 +23,9 @@ namespace phi {
template <typename T, typename Context> \
void name##Kernel( \
const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) { \
functor_class functor; \
ActivationImpl<T, Context, functor_class>(dev_ctx, x, out, functor); \
funcs::functor_class<T> functor; \
ActivationImpl<T, Context, funcs::functor_class<T>>( \
dev_ctx, x, out, functor); \
}
#define DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(name, functor_class, attr) \
......@@ -33,10 +34,11 @@ namespace phi {
const DenseTensor& x, \
float attr, \
DenseTensor* out) { \
functor_class<T> functor; \
funcs::functor_class<T> functor; \
auto attrs = functor.GetAttrs(); \
*(attrs[0].second) = attr; \
ActivationImpl<T, Context, functor_class<T>>(dev_ctx, x, out, functor); \
ActivationImpl<T, Context, funcs::functor_class<T>>( \
dev_ctx, x, out, functor); \
}
#define DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS( \
......@@ -47,50 +49,63 @@ namespace phi {
float attr1, \
float attr2, \
DenseTensor* out) { \
functor_class<T> functor; \
funcs::functor_class<T> functor; \
auto attrs = functor.GetAttrs(); \
*(attrs[0].second) = attr1; \
*(attrs[1].second) = attr2; \
ActivationImpl<T, Context, functor_class<T>>(dev_ctx, x, out, functor); \
ActivationImpl<T, Context, funcs::functor_class<T>>( \
dev_ctx, x, out, functor); \
}
DEFINE_CPU_ACTIVATION_KERNEL(Sin, funcs::SinFunctor<T>)
DEFINE_CPU_ACTIVATION_KERNEL(Cos, funcs::CosFunctor<T>)
DEFINE_CPU_ACTIVATION_KERNEL(Tan, funcs::TanFunctor<T>)
DEFINE_CPU_ACTIVATION_KERNEL(Asin, funcs::AsinFunctor<T>)
DEFINE_CPU_ACTIVATION_KERNEL(Atan, funcs::AtanFunctor<T>)
DEFINE_CPU_ACTIVATION_KERNEL(Acos, funcs::AcosFunctor<T>)
DEFINE_CPU_ACTIVATION_KERNEL(Sinh, funcs::SinhFunctor<T>)
DEFINE_CPU_ACTIVATION_KERNEL(Cosh, funcs::CoshFunctor<T>)
DEFINE_CPU_ACTIVATION_KERNEL(Asinh, funcs::AsinhFunctor<T>)
DEFINE_CPU_ACTIVATION_KERNEL(Acosh, funcs::AcoshFunctor<T>)
DEFINE_CPU_ACTIVATION_KERNEL(Atanh, funcs::AtanhFunctor<T>)
DEFINE_CPU_ACTIVATION_KERNEL(Relu, funcs::ReluCPUFunctor<T>)
DEFINE_CPU_ACTIVATION_KERNEL(Tanh, funcs::TanhFunctor<T>)
DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, funcs::LeakyReluFunctor, alpha)
DEFINE_CPU_ACTIVATION_KERNEL(Sin, SinFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Cos, CosFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Tan, TanFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Asin, AsinFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Atan, AtanFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Acos, AcosFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Sinh, SinhFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Cosh, CoshFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Asinh, AsinhFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Acosh, AcoshFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Atanh, AtanhFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Relu, ReluCPUFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Tanh, TanhFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(TanhShrink, TanhShrinkFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Silu, SiluFunctor)
DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, LeakyReluFunctor, alpha)
DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu,
funcs::ThresholdedReluFunctor,
ThresholdedReluFunctor,
threshold)
DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(BRelu, funcs::BReluFunctor, t_min, t_max)
DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(HardShrink, HardShrinkFunctor, threshold)
DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(SoftShrink, SoftShrinkFunctor, lambda)
DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(Elu, ELUFunctor, alpha)
DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(BRelu, BReluFunctor, t_min, t_max)
} // namespace phi
PD_REGISTER_KERNEL(relu, CPU, ALL_LAYOUT, phi::ReluKernel, float, double) {}
#define PD_REGISTER_ACTIVATION_KERNEL(name, func) \
PD_REGISTER_KERNEL(name, CPU, ALL_LAYOUT, phi::func##Kernel, float, double) {}
PD_REGISTER_KERNEL(name, CPU, ALL_LAYOUT, phi::func, float, double) {}
PD_REGISTER_ACTIVATION_KERNEL(sin, Sin)
PD_REGISTER_ACTIVATION_KERNEL(cos, Cos)
PD_REGISTER_ACTIVATION_KERNEL(tan, Tan)
PD_REGISTER_ACTIVATION_KERNEL(acos, Acos)
PD_REGISTER_ACTIVATION_KERNEL(asin, Asin)
PD_REGISTER_ACTIVATION_KERNEL(atan, Atan)
PD_REGISTER_ACTIVATION_KERNEL(sinh, Sinh)
PD_REGISTER_ACTIVATION_KERNEL(cosh, Cosh)
PD_REGISTER_ACTIVATION_KERNEL(asinh, Asinh)
PD_REGISTER_ACTIVATION_KERNEL(acosh, Acosh)
PD_REGISTER_ACTIVATION_KERNEL(atanh, Atanh)
PD_REGISTER_ACTIVATION_KERNEL(tanh, Tanh)
PD_REGISTER_ACTIVATION_KERNEL(brelu, BRelu)
PD_REGISTER_ACTIVATION_KERNEL(leaky_relu, LeakyRelu)
PD_REGISTER_ACTIVATION_KERNEL(thresholded_relu, ThresholdedRelu)
PD_REGISTER_ACTIVATION_KERNEL(sin, SinKernel)
PD_REGISTER_ACTIVATION_KERNEL(cos, CosKernel)
PD_REGISTER_ACTIVATION_KERNEL(tan, TanKernel)
PD_REGISTER_ACTIVATION_KERNEL(acos, AcosKernel)
PD_REGISTER_ACTIVATION_KERNEL(asin, AsinKernel)
PD_REGISTER_ACTIVATION_KERNEL(atan, AtanKernel)
PD_REGISTER_ACTIVATION_KERNEL(sinh, SinhKernel)
PD_REGISTER_ACTIVATION_KERNEL(cosh, CoshKernel)
PD_REGISTER_ACTIVATION_KERNEL(asinh, AsinhKernel)
PD_REGISTER_ACTIVATION_KERNEL(acosh, AcoshKernel)
PD_REGISTER_ACTIVATION_KERNEL(atanh, AtanhKernel)
PD_REGISTER_ACTIVATION_KERNEL(tanh, TanhKernel)
PD_REGISTER_ACTIVATION_KERNEL(brelu, BReluKernel)
PD_REGISTER_ACTIVATION_KERNEL(leaky_relu, LeakyReluKernel)
PD_REGISTER_ACTIVATION_KERNEL(thresholded_relu, ThresholdedReluKernel)
PD_REGISTER_ACTIVATION_KERNEL(hard_shrink, HardShrinkKernel)
PD_REGISTER_ACTIVATION_KERNEL(soft_shrink, SoftShrinkKernel)
PD_REGISTER_ACTIVATION_KERNEL(tanh_shrink, TanhShrinkKernel)
PD_REGISTER_ACTIVATION_KERNEL(elu, EluKernel)
PD_REGISTER_ACTIVATION_KERNEL(silu, SiluKernel)
(3 file diffs collapsed in the web view; contents not shown.)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/index_select_grad_kernel.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/utils/data_type.h"
#include "paddle/phi/kernels/cpu/index_select_impl.h"
namespace phi {
template <typename T, typename Context>
void IndexSelectGradKernel(const Context& ctx,
const DenseTensor& x,
const DenseTensor& index,
const DenseTensor& out_grad,
int dim,
DenseTensor* x_grad) {
if (dim < 0) {
dim += out_grad.dims().size();
}
const auto& index_type = index.dtype();
bool index_type_match =
index_type == phi::DataType::INT32 || index_type == phi::DataType::INT64;
PADDLE_ENFORCE_EQ(index_type_match,
true,
phi::errors::InvalidArgument(
"Input(Index) holds the wrong type, it holds %s, but "
"desires to be %s or %s",
index_type,
phi::DataType::INT32,
phi::DataType::INT64));
if (index_type == phi::DataType::INT32) {
IndexSelectGradInner<Context, T, int>(ctx, out_grad, index, x_grad, dim);
} else if (index_type == phi::DataType::INT64) {
IndexSelectGradInner<Context, T, int64_t>(
ctx, out_grad, index, x_grad, dim);
}
}
} // namespace phi
PD_REGISTER_KERNEL(index_select_grad,
CPU,
ALL_LAYOUT,
phi::IndexSelectGradKernel,
float,
double,
int,
int64_t) {}
(1 file diff collapsed in the web view; contents not shown.)
......@@ -12,32 +12,50 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/reduce_prod_kernel.h"
#include "paddle/phi/kernels/index_select_kernel.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/cpu/reduce.h"
#include "paddle/phi/kernels/funcs/reduce_functor.h"
#include "paddle/phi/core/utils/data_type.h"
#include "paddle/phi/kernels/cpu/index_select_impl.h"
namespace phi {
template <typename T, typename Context>
void ReduceProdKernel(const Context& dev_ctx,
void IndexSelectKernel(const Context& ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
DenseTensor* out) {
auto out_dtype = x.dtype();
phi::Reduce<CPUContext, T, phi::funcs::ProdFunctor>(
dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
const DenseTensor& index,
int dim,
DenseTensor* output) {
auto inputs = x;
if (dim < 0) {
dim += inputs.dims().size();
}
const auto& index_type = index.dtype();
bool index_type_match =
index_type == phi::DataType::INT32 || index_type == phi::DataType::INT64;
PADDLE_ENFORCE_EQ(index_type_match,
true,
phi::errors::InvalidArgument(
"Input(Index) holds the wrong type, it holds %s, but "
"desires to be %s or %s",
index_type,
phi::DataType::INT32,
phi::DataType::INT64));
if (index_type == phi::DataType::INT32) {
IndexSelectInner<Context, T, int>(ctx, &inputs, index, output, dim);
} else if (index_type == phi::DataType::INT64) {
IndexSelectInner<Context, T, int64_t>(ctx, &inputs, index, output, dim);
}
}
} // namespace phi
PD_REGISTER_KERNEL(reduce_prod,
PD_REGISTER_KERNEL(index_select,
CPU,
ALL_LAYOUT,
phi::ReduceProdKernel,
phi::IndexSelectKernel,
float,
double,
int,
......
......@@ -13,6 +13,8 @@
// limitations under the License.
#include "paddle/phi/kernels/lgamma_kernel.h"
#include <unsupported/Eigen/SpecialFunctions>
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/for_range.h"
......
......@@ -23,7 +23,7 @@
#include "paddle/phi/kernels/funcs/elementwise_base.h"
#include "paddle/phi/kernels/impl/matrix_rank_kernel_impl.h"
#include "paddle/phi/kernels/math_kernel.h"
#include "paddle/phi/kernels/reduce_max_kernel.h"
#include "paddle/phi/kernels/reduce_kernel.h"
namespace phi {
......
(11 file diffs collapsed in the web view; contents not shown.)
......@@ -28,7 +28,7 @@
#include "paddle/phi/kernels/funcs/compare_functors.h"
#include "paddle/phi/kernels/impl/matrix_rank_kernel_impl.h"
#include "paddle/phi/kernels/math_kernel.h"
#include "paddle/phi/kernels/reduce_max_kernel.h"
#include "paddle/phi/kernels/reduce_kernel.h"
namespace phi {
......
(34 file diffs collapsed in the web view; contents not shown.)